Merge branch 'MON-459_bump_hcl2' into 'master'

MON-459: Bump to HCL 2 syntax / tf 0.12.3

Closes MON-459

See merge request claranet/pt-monitoring/projects/datadog/terraform/monitors!64
This commit is contained in:
Quentin Manfroi 2019-07-05 11:11:53 +02:00
commit 026c32ee94
294 changed files with 4890 additions and 5757 deletions

View File

@ -11,7 +11,7 @@ before_script:
- apk add --no-cache curl - apk add --no-cache curl
auto_update: auto_update:
image: hashicorp/terraform:0.11.14 image: hashicorp/terraform:0.12.3
stage: test stage: test
script: script:
- apk add --no-cache bash git grep coreutils - apk add --no-cache bash git grep coreutils

106
README.md
View File

@ -2,8 +2,7 @@
This repository is used to store all our monitors templates ready to use for generic purpose. This repository is used to store all our monitors templates ready to use for generic purpose.
## How to contribute ? ##
### How to contribute ? ###
First, you may refresh your knowledge and look at the [terminology](https://confluence.fr.clara.net/display/DAT/Getting+started). First, you may refresh your knowledge and look at the [terminology](https://confluence.fr.clara.net/display/DAT/Getting+started).
@ -13,7 +12,7 @@ If you would like to resolve an issue or implement new monitors you must follow
After any change you should run `./scripts/auto_update.sh ./` command to make sure all is up to date else the CI pipeline will fail on the branch. After any change you should run `./scripts/auto_update.sh ./` command to make sure all is up to date else the CI pipeline will fail on the branch.
### Important notes ### ## Important notes ##
* This repository represents a terraform feature and each first level directory could be imported as a terraform module, you must choose the one(s) you need. * This repository represents a terraform feature and each first level directory could be imported as a terraform module, you must choose the one(s) you need.
* Each of these modules contains the most common monitors, but they probably do not fulfill all your customer needs * Each of these modules contains the most common monitors, but they probably do not fulfill all your customer needs
@ -21,54 +20,107 @@ After any change you should run `./scripts/auto_update.sh ./` command to make su
* You will find a complete `README.md` on each module, explaining how to use it. * You will find a complete `README.md` on each module, explaining how to use it.
* The `alerting-message` module could be used to easily generate a templating message to use by default but it could be used also multiple times to generate messages for specific monitors. * The `alerting-message` module could be used to easily generate a templating message to use by default but it could be used also multiple times to generate messages for specific monitors.
### The DataDog provider ### ## Getting started ##
Before importing some modules, you must define the DataDog provider in your `main.tf` ### Terraform ###
Version >= 0.12 is required to use these modules of monitors.
```
terraform {
required_version = "~> 0.12"
}
```
### DataDog provider ###
Here is the last tested Terraform provider version for Datadog; later versions should work too.
``` ```
provider "datadog" { provider "datadog" {
version = "2.0.2" version = "2.0.2" # last tested working version
api_key = "${var.datadog_api_key}" api_key = var.datadog_api_key
app_key = "${var.datadog_app_key}" app_key = var.datadog_app_key
} }
``` ```
Both of the `datadog_api_key` and `datadog_app_key` are unique to the client. Both of the `datadog_api_key` and `datadog_app_key` are unique to each datadog account. You can define them in the `terraform.tfvars` file:
### Module declaration example ###
A quick example of using a set of monitors for a given terraform module:
``` ```
variable "oncall_24x7" { datadog_api_key = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
default = "@pagerduty-Public_Cloud_FR_-_Yoda_-_Unibail_HNO" datadog_app_key = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
```
### Variables ###
Some variables need to be declared.
```
variable "environment" {
type = string
default = "dev"
} }
variable "oncall_office_hours" { variable "datadog_api_key" {
default = "@pagerduty-Public_Cloud_FR_-_Yoda_-_Unibail_HO" type = string
} }
variable "oncall_nodata" { variable "datadog_app_key" {
default = "@pagerduty-Public_Cloud_FR_-_Yoda_-_Unibail_HNO" type = string
}
```
## Modules declaration example ##
A quick example of alerting message module declaration:
```
locals {
oncall_24x7 = "@pagerduty-MyPagerService_NBH"
oncall_office_hours = "@pagerduty-MyPagerService_BH"
} }
module "datadog-message-alerting" { module "datadog-message-alerting" {
source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//common/alerting-message" source = "git::ssh://git@git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors.git//common/alerting-message?ref={RELEASE}"
message_alert = "${var.oncall_24x7}" message_alert = local.oncall_24x7
message_warning = "${var.oncall_office_hours}" message_warning = local.oncall_office_hours
message_nodata = "${var.oncall_nodata}" message_nodata = local.oncall_24x7
} }
module "datadog-monitors-my-monitors-set" { module "datadog-message-alerting-bh-only" {
source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//my/monitors/set?ref={revision}" source = "git::ssh://git@git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors.git//common/alerting-message?ref={RELEASE}"
environment = "${var.environment}" message_alert = local.oncall_office_hours
message = "${module.datadog-message-alerting.alerting-message}" message_warning = local.oncall_office_hours
message_nodata = local.oncall_office_hours
} }
module "datadog-monitors-system-generic" {
source = "git::ssh://git@git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors.git//system/generic?ref={RELEASE}"
environment = var.environment
message = module.datadog-message-alerting.alerting-message
memory_message = module.datadog-message-alerting-bh-only.alerting-message
# Use variables to customize monitors configuration
}
# Other monitors modules to declare ...
#module "datadog-monitors-my-monitors-set" {
# source = "git::ssh://git@git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors.git//my/monitors/set?ref={RELEASE}"
#
# environment = var.environment
# message = module.datadog-message-alerting.alerting-message
#}
``` ```
Replace `{revision}` to the last git tag available on this repository.
The `//` is very important, it's a terraform specific syntax used to separate git url and folder path. The `//` is very important, it's a terraform specific syntax used to separate git url and folder path.
`my/monitors/set` represents the path to a monitors set sub directory listed below. `my/monitors/set` represents the path to a monitors set sub directory listed below.

View File

@ -6,8 +6,8 @@
module "datadog-monitors-caas-kubernetes-ark" { module "datadog-monitors-caas-kubernetes-ark" {
source = "git::ssh://git@git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors.git//caas/kubernetes/ark?ref={revision}" source = "git::ssh://git@git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors.git//caas/kubernetes/ark?ref={revision}"
environment = "${var.environment}" environment = var.environment
message = "${module.datadog-message-alerting.alerting-message}" message = module.datadog-message-alerting.alerting-message
} }
``` ```
@ -23,7 +23,7 @@ Creates DataDog monitors with the following checks:
| Name | Description | Type | Default | Required | | Name | Description | Type | Default | Required |
|------|-------------|:----:|:-----:|:-----:| |------|-------------|:----:|:-----:|:-----:|
| ark\_schedules\_enabled | Flag to enable Ark schedules monitor | string | `"true"` | no | | ark\_schedules\_enabled | Flag to enable Ark schedules monitor | string | `"true"` | no |
| ark\_schedules\_extra\_tags | Extra tags for Ark schedules monitor | list | `[]` | no | | ark\_schedules\_extra\_tags | Extra tags for Ark schedules monitor | list(string) | `[]` | no |
| ark\_schedules\_monitor\_message | Custom message for Ark schedules monitor | string | `""` | no | | ark\_schedules\_monitor\_message | Custom message for Ark schedules monitor | string | `""` | no |
| ark\_schedules\_monitor\_no\_data\_timeframe | No data timeframe in minutes | string | `"1440"` | no | | ark\_schedules\_monitor\_no\_data\_timeframe | No data timeframe in minutes | string | `"1440"` | no |
| ark\_schedules\_monitor\_timeframe | Monitor timeframe for Ark schedules monitor [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_1d"` | no | | ark\_schedules\_monitor\_timeframe | Monitor timeframe for Ark schedules monitor [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_1d"` | no |

View File

@ -42,25 +42,25 @@ variable "prefix_slug" {
variable "ark_schedules_monitor_message" { variable "ark_schedules_monitor_message" {
description = "Custom message for Ark schedules monitor" description = "Custom message for Ark schedules monitor"
type = "string" type = string
default = "" default = ""
} }
variable "ark_schedules_monitor_timeframe" { variable "ark_schedules_monitor_timeframe" {
description = "Monitor timeframe for Ark schedules monitor [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" description = "Monitor timeframe for Ark schedules monitor [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string" type = string
default = "last_1d" default = "last_1d"
} }
variable "ark_schedules_enabled" { variable "ark_schedules_enabled" {
description = "Flag to enable Ark schedules monitor" description = "Flag to enable Ark schedules monitor"
type = "string" type = string
default = "true" default = "true"
} }
variable "ark_schedules_extra_tags" { variable "ark_schedules_extra_tags" {
description = "Extra tags for Ark schedules monitor" description = "Extra tags for Ark schedules monitor"
type = "list" type = list(string)
default = [] default = []
} }
@ -68,3 +68,4 @@ variable "ark_schedules_monitor_no_data_timeframe" {
description = "No data timeframe in minutes" description = "No data timeframe in minutes"
default = 1440 default = 1440
} }

View File

@ -1,9 +1,10 @@
module "filter-tags" { module "filter-tags" {
source = "../../../common/filter-tags" source = "../../../common/filter-tags"
environment = "${var.environment}" environment = var.environment
resource = "ark" resource = "ark"
filter_tags_use_defaults = "${var.filter_tags_use_defaults}" filter_tags_use_defaults = var.filter_tags_use_defaults
filter_tags_custom = "${var.filter_tags_custom}" filter_tags_custom = var.filter_tags_custom
filter_tags_custom_excluded = "${var.filter_tags_custom_excluded}" filter_tags_custom_excluded = var.filter_tags_custom_excluded
} }

View File

@ -1,21 +1,21 @@
resource "datadog_monitor" "ark_schedules_monitor" { resource "datadog_monitor" "ark_schedules_monitor" {
count = "${var.ark_schedules_enabled == "true" ? 1 : 0}" count = var.ark_schedules_enabled == "true" ? 1 : 0
name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Ark backup failed" name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Ark backup failed"
type = "query alert" type = "query alert"
message = "${coalesce(var.ark_schedules_monitor_message, var.message)}" message = coalesce(var.ark_schedules_monitor_message, var.message)
query = <<EOQ query = <<EOQ
sum(${var.ark_schedules_monitor_timeframe}):min:ark.ark_backup_failure_total${module.filter-tags.query_alert} by {schedule}.as_count() > 1 sum(${var.ark_schedules_monitor_timeframe}):min:ark.ark_backup_failure_total${module.filter-tags.query_alert} by {schedule}.as_count() > 1
EOQ EOQ
thresholds { thresholds = {
critical = 1 critical = 1
warning = 0 warning = 0
} }
evaluation_delay = "${var.evaluation_delay}" evaluation_delay = var.evaluation_delay
new_host_delay = "${var.new_host_delay}" new_host_delay = var.new_host_delay
no_data_timeframe = "${var.ark_schedules_monitor_no_data_timeframe}" no_data_timeframe = var.ark_schedules_monitor_no_data_timeframe
notify_no_data = true notify_no_data = true
renotify_interval = 0 renotify_interval = 0
@ -25,5 +25,6 @@ resource "datadog_monitor" "ark_schedules_monitor" {
locked = false locked = false
require_full_window = false require_full_window = false
tags = ["env:${var.environment}", "type:caas", "provider:prometheus", "resource:ark", "team:claranet", "created-by:terraform", "${var.ark_schedules_extra_tags}"] tags = concat(["env:${var.environment}", "type:caas", "provider:prometheus", "resource:ark", "team:claranet", "created-by:terraform"], var.ark_schedules_extra_tags)
} }

View File

@ -1,4 +1,5 @@
output "ark_schedules_monitor_id" { output "ark_schedules_monitor_id" {
description = "id for monitor ark_schedules_monitor" description = "id for monitor ark_schedules_monitor"
value = "${datadog_monitor.ark_schedules_monitor.*.id}" value = datadog_monitor.ark_schedules_monitor.*.id
} }

View File

@ -6,8 +6,8 @@
module "datadog-monitors-caas-kubernetes-cluster" { module "datadog-monitors-caas-kubernetes-cluster" {
source = "git::ssh://git@git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors.git//caas/kubernetes/cluster?ref={revision}" source = "git::ssh://git@git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors.git//caas/kubernetes/cluster?ref={revision}"
environment = "${var.environment}" environment = var.environment
message = "${module.datadog-message-alerting.alerting-message}" message = module.datadog-message-alerting.alerting-message
} }
``` ```
@ -23,7 +23,7 @@ Creates DataDog monitors with the following checks:
| Name | Description | Type | Default | Required | | Name | Description | Type | Default | Required |
|------|-------------|:----:|:-----:|:-----:| |------|-------------|:----:|:-----:|:-----:|
| apiserver\_enabled | Flag to enable API server monitor | string | `"true"` | no | | apiserver\_enabled | Flag to enable API server monitor | string | `"true"` | no |
| apiserver\_extra\_tags | Extra tags for API server monitor | list | `[]` | no | | apiserver\_extra\_tags | Extra tags for API server monitor | list(string) | `[]` | no |
| apiserver\_message | Custom message for API server monitor | string | `""` | no | | apiserver\_message | Custom message for API server monitor | string | `""` | no |
| apiserver\_threshold\_warning | API server monitor (warning threshold) | string | `"3"` | no | | apiserver\_threshold\_warning | API server monitor (warning threshold) | string | `"3"` | no |
| environment | Architecture environment | string | n/a | yes | | environment | Architecture environment | string | n/a | yes |

View File

@ -42,24 +42,25 @@ variable "prefix_slug" {
variable "apiserver_enabled" { variable "apiserver_enabled" {
description = "Flag to enable API server monitor" description = "Flag to enable API server monitor"
type = "string" type = string
default = "true" default = "true"
} }
variable "apiserver_extra_tags" { variable "apiserver_extra_tags" {
description = "Extra tags for API server monitor" description = "Extra tags for API server monitor"
type = "list" type = list(string)
default = [] default = []
} }
variable "apiserver_message" { variable "apiserver_message" {
description = "Custom message for API server monitor" description = "Custom message for API server monitor"
type = "string" type = string
default = "" default = ""
} }
variable "apiserver_threshold_warning" { variable "apiserver_threshold_warning" {
description = "API server monitor (warning threshold)" description = "API server monitor (warning threshold)"
type = "string" type = string
default = 3 default = 3
} }

View File

@ -1,9 +1,10 @@
module "filter-tags" { module "filter-tags" {
source = "../../../common/filter-tags" source = "../../../common/filter-tags"
environment = "${var.environment}" environment = var.environment
resource = "kubernetes" resource = "kubernetes"
filter_tags_use_defaults = "${var.filter_tags_use_defaults}" filter_tags_use_defaults = var.filter_tags_use_defaults
filter_tags_custom = "${var.filter_tags_custom}" filter_tags_custom = var.filter_tags_custom
filter_tags_custom_excluded = "${var.filter_tags_custom_excluded}" filter_tags_custom_excluded = var.filter_tags_custom_excluded
} }

View File

@ -1,20 +1,20 @@
resource "datadog_monitor" "apiserver" { resource "datadog_monitor" "apiserver" {
count = "${var.apiserver_enabled == "true" ? 1 : 0}" count = var.apiserver_enabled == "true" ? 1 : 0
name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Kubernetes API server does not respond" name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Kubernetes API server does not respond"
message = "${coalesce(var.apiserver_message, var.message)}" message = coalesce(var.apiserver_message, var.message)
type = "service check" type = "service check"
query = <<EOQ query = <<EOQ
"kube_apiserver_controlplane.up"${module.filter-tags.service_check}.last(6).count_by_status() "kube_apiserver_controlplane.up"${module.filter-tags.service_check}.last(6).count_by_status()
EOQ EOQ
thresholds = { thresholds = {
warning = "${var.apiserver_threshold_warning}" warning = var.apiserver_threshold_warning
critical = 5 critical = 5
} }
new_host_delay = "${var.new_host_delay}" new_host_delay = var.new_host_delay
notify_no_data = true notify_no_data = true
renotify_interval = 0 renotify_interval = 0
notify_audit = false notify_audit = false
@ -23,5 +23,6 @@ resource "datadog_monitor" "apiserver" {
locked = false locked = false
require_full_window = true require_full_window = true
tags = ["env:${var.environment}", "type:caas", "provider:kubernetes", "resource:kubernetes-node", "team:claranet", "created-by:terraform", "${var.apiserver_extra_tags}"] tags = concat(["env:${var.environment}", "type:caas", "provider:kubernetes", "resource:kubernetes-node", "team:claranet", "created-by:terraform"], var.apiserver_extra_tags)
} }

View File

@ -1,4 +1,5 @@
output "apiserver_id" { output "apiserver_id" {
description = "id for monitor apiserver" description = "id for monitor apiserver"
value = "${datadog_monitor.apiserver.*.id}" value = datadog_monitor.apiserver.*.id
} }

View File

@ -6,8 +6,8 @@
module "datadog-monitors-caas-kubernetes-ingress-vts" { module "datadog-monitors-caas-kubernetes-ingress-vts" {
source = "git::ssh://git@git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors.git//caas/kubernetes/ingress/vts?ref={revision}" source = "git::ssh://git@git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors.git//caas/kubernetes/ingress/vts?ref={revision}"
environment = "${var.environment}" environment = var.environment
message = "${module.datadog-message-alerting.alerting-message}" message = module.datadog-message-alerting.alerting-message
} }
``` ```
@ -30,14 +30,14 @@ Creates DataDog monitors with the following checks:
| filter\_tags\_custom\_excluded | Tags excluded for custom filtering when filter_tags_use_defaults is false | string | `""` | no | | filter\_tags\_custom\_excluded | Tags excluded for custom filtering when filter_tags_use_defaults is false | string | `""` | no |
| filter\_tags\_use\_defaults | Use default filter tags convention | string | `"true"` | no | | filter\_tags\_use\_defaults | Use default filter tags convention | string | `"true"` | no |
| ingress\_4xx\_enabled | Flag to enable Ingress 4xx errors monitor | string | `"true"` | no | | ingress\_4xx\_enabled | Flag to enable Ingress 4xx errors monitor | string | `"true"` | no |
| ingress\_4xx\_extra\_tags | Extra tags for Ingress 4xx errors monitor | list | `[]` | no | | ingress\_4xx\_extra\_tags | Extra tags for Ingress 4xx errors monitor | list(string) | `[]` | no |
| ingress\_4xx\_message | Message sent when an alert is triggered | string | `""` | no | | ingress\_4xx\_message | Message sent when an alert is triggered | string | `""` | no |
| ingress\_4xx\_threshold\_critical | 4xx critical threshold in percentage | string | `"40"` | no | | ingress\_4xx\_threshold\_critical | 4xx critical threshold in percentage | string | `"40"` | no |
| ingress\_4xx\_threshold\_warning | 4xx warning threshold in percentage | string | `"20"` | no | | ingress\_4xx\_threshold\_warning | 4xx warning threshold in percentage | string | `"20"` | no |
| ingress\_4xx\_time\_aggregator | Monitor aggregator for Ingress 4xx errors [available values: min, max or avg] | string | `"min"` | no | | ingress\_4xx\_time\_aggregator | Monitor aggregator for Ingress 4xx errors [available values: min, max or avg] | string | `"min"` | no |
| ingress\_4xx\_timeframe | Monitor timeframe for Ingress 4xx errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no | | ingress\_4xx\_timeframe | Monitor timeframe for Ingress 4xx errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no |
| ingress\_5xx\_enabled | Flag to enable Ingress 5xx errors monitor | string | `"true"` | no | | ingress\_5xx\_enabled | Flag to enable Ingress 5xx errors monitor | string | `"true"` | no |
| ingress\_5xx\_extra\_tags | Extra tags for Ingress 5xx errors monitor | list | `[]` | no | | ingress\_5xx\_extra\_tags | Extra tags for Ingress 5xx errors monitor | list(string) | `[]` | no |
| ingress\_5xx\_message | Message sent when an alert is triggered | string | `""` | no | | ingress\_5xx\_message | Message sent when an alert is triggered | string | `""` | no |
| ingress\_5xx\_threshold\_critical | 5xx critical threshold in percentage | string | `"20"` | no | | ingress\_5xx\_threshold\_critical | 5xx critical threshold in percentage | string | `"20"` | no |
| ingress\_5xx\_threshold\_warning | 5xx warning threshold in percentage | string | `"10"` | no | | ingress\_5xx\_threshold\_warning | 5xx warning threshold in percentage | string | `"10"` | no |

View File

@ -1,7 +1,7 @@
# Global Terraform # Global Terraform
variable "environment" { variable "environment" {
description = "Architecture Environment" description = "Architecture Environment"
type = "string" type = string
} }
# Global DataDog # Global DataDog
@ -43,13 +43,13 @@ variable "filter_tags_custom_excluded" {
variable "ingress_5xx_enabled" { variable "ingress_5xx_enabled" {
description = "Flag to enable Ingress 5xx errors monitor" description = "Flag to enable Ingress 5xx errors monitor"
type = "string" type = string
default = "true" default = "true"
} }
variable "ingress_5xx_extra_tags" { variable "ingress_5xx_extra_tags" {
description = "Extra tags for Ingress 5xx errors monitor" description = "Extra tags for Ingress 5xx errors monitor"
type = "list" type = list(string)
default = [] default = []
} }
@ -60,37 +60,37 @@ variable "ingress_5xx_message" {
variable "ingress_5xx_time_aggregator" { variable "ingress_5xx_time_aggregator" {
description = "Monitor aggregator for Ingress 5xx errors [available values: min, max or avg]" description = "Monitor aggregator for Ingress 5xx errors [available values: min, max or avg]"
type = "string" type = string
default = "min" default = "min"
} }
variable "ingress_5xx_timeframe" { variable "ingress_5xx_timeframe" {
description = "Monitor timeframe for Ingress 5xx errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" description = "Monitor timeframe for Ingress 5xx errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string" type = string
default = "last_5m" default = "last_5m"
} }
variable "ingress_5xx_threshold_critical" { variable "ingress_5xx_threshold_critical" {
type = "string" type = string
default = "20" default = "20"
description = "5xx critical threshold in percentage" description = "5xx critical threshold in percentage"
} }
variable "ingress_5xx_threshold_warning" { variable "ingress_5xx_threshold_warning" {
type = "string" type = string
default = "10" default = "10"
description = "5xx warning threshold in percentage" description = "5xx warning threshold in percentage"
} }
variable "ingress_4xx_enabled" { variable "ingress_4xx_enabled" {
description = "Flag to enable Ingress 4xx errors monitor" description = "Flag to enable Ingress 4xx errors monitor"
type = "string" type = string
default = "true" default = "true"
} }
variable "ingress_4xx_extra_tags" { variable "ingress_4xx_extra_tags" {
description = "Extra tags for Ingress 4xx errors monitor" description = "Extra tags for Ingress 4xx errors monitor"
type = "list" type = list(string)
default = [] default = []
} }
@ -101,24 +101,24 @@ variable "ingress_4xx_message" {
variable "ingress_4xx_time_aggregator" { variable "ingress_4xx_time_aggregator" {
description = "Monitor aggregator for Ingress 4xx errors [available values: min, max or avg]" description = "Monitor aggregator for Ingress 4xx errors [available values: min, max or avg]"
type = "string" type = string
default = "min" default = "min"
} }
variable "ingress_4xx_timeframe" { variable "ingress_4xx_timeframe" {
description = "Monitor timeframe for Ingress 4xx errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" description = "Monitor timeframe for Ingress 4xx errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string" type = string
default = "last_5m" default = "last_5m"
} }
variable "ingress_4xx_threshold_critical" { variable "ingress_4xx_threshold_critical" {
type = "string" type = string
default = "40" default = "40"
description = "4xx critical threshold in percentage" description = "4xx critical threshold in percentage"
} }
variable "ingress_4xx_threshold_warning" { variable "ingress_4xx_threshold_warning" {
type = "string" type = string
default = "20" default = "20"
description = "4xx warning threshold in percentage" description = "4xx warning threshold in percentage"
} }
@ -127,3 +127,4 @@ variable "artificial_requests_count" {
default = 5 default = 5
description = "Number of false requests used to mitigate false positive in case of low trafic" description = "Number of false requests used to mitigate false positive in case of low trafic"
} }

View File

@ -1,22 +1,22 @@
module "filter-tags" { module "filter-tags" {
source = "../../../../common/filter-tags" source = "../../../../common/filter-tags"
environment = "${var.environment}" environment = var.environment
resource = "ingress" resource = "ingress"
filter_tags_use_defaults = "${var.filter_tags_use_defaults}" filter_tags_use_defaults = var.filter_tags_use_defaults
filter_tags_custom = "${var.filter_tags_custom}" filter_tags_custom = var.filter_tags_custom
filter_tags_custom_excluded = "${var.filter_tags_custom_excluded}" filter_tags_custom_excluded = var.filter_tags_custom_excluded
extra_tags_excluded = ["upstream:upstream-default-backend"] extra_tags_excluded = ["upstream:upstream-default-backend"]
} }
module "filter-tags-5xx" { module "filter-tags-5xx" {
source = "../../../../common/filter-tags" source = "../../../../common/filter-tags"
environment = "${var.environment}" environment = var.environment
resource = "ingress" resource = "ingress"
filter_tags_use_defaults = "${var.filter_tags_use_defaults}" filter_tags_use_defaults = var.filter_tags_use_defaults
filter_tags_custom = "${var.filter_tags_custom}" filter_tags_custom = var.filter_tags_custom
filter_tags_custom_excluded = "${var.filter_tags_custom_excluded}" filter_tags_custom_excluded = var.filter_tags_custom_excluded
extra_tags = ["status_code:5xx"] extra_tags = ["status_code:5xx"]
extra_tags_excluded = ["upstream:upstream-default-backend"] extra_tags_excluded = ["upstream:upstream-default-backend"]
} }
@ -24,11 +24,12 @@ module "filter-tags-5xx" {
module "filter-tags-4xx" { module "filter-tags-4xx" {
source = "../../../../common/filter-tags" source = "../../../../common/filter-tags"
environment = "${var.environment}" environment = var.environment
resource = "ingress" resource = "ingress"
filter_tags_use_defaults = "${var.filter_tags_use_defaults}" filter_tags_use_defaults = var.filter_tags_use_defaults
filter_tags_custom = "${var.filter_tags_custom}" filter_tags_custom = var.filter_tags_custom
filter_tags_custom_excluded = "${var.filter_tags_custom_excluded}" filter_tags_custom_excluded = var.filter_tags_custom_excluded
extra_tags = ["status_code:4xx"] extra_tags = ["status_code:4xx"]
extra_tags_excluded = ["upstream:upstream-default-backend"] extra_tags_excluded = ["upstream:upstream-default-backend"]
} }

View File

@ -1,25 +1,24 @@
resource "datadog_monitor" "nginx_ingress_too_many_5xx" { resource "datadog_monitor" "nginx_ingress_too_many_5xx" {
count = "${var.ingress_5xx_enabled == "true" ? 1 : 0}" count = var.ingress_5xx_enabled == "true" ? 1 : 0
name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Nginx Ingress 5xx errors {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Nginx Ingress 5xx errors {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
message = "${coalesce(var.ingress_5xx_message, var.message)}" message = coalesce(var.ingress_5xx_message, var.message)
type = "query alert"
query = <<EOQ query = <<EOQ
${var.ingress_5xx_time_aggregator}(${var.ingress_5xx_timeframe}): default( ${var.ingress_5xx_time_aggregator}(${var.ingress_5xx_timeframe}): default(
sum:nginx_ingress.nginx_upstream_responses_total${module.filter-tags-5xx.query_alert} by {upstream,ingress_class}.as_rate() / sum:nginx_ingress.nginx_upstream_responses_total${module.filter-tags-5xx.query_alert} by {upstream,ingress_class}.as_rate() /
(sum:nginx_ingress.nginx_upstream_requests_total${module.filter-tags.query_alert} by {upstream,ingress_class}.as_rate() + ${var.artificial_requests_count}) (sum:nginx_ingress.nginx_upstream_requests_total${module.filter-tags.query_alert} by {upstream,ingress_class}.as_rate() + ${var.artificial_requests_count})
* 100, 0) > ${var.ingress_5xx_threshold_critical} * 100, 0) > ${var.ingress_5xx_threshold_critical}
EOQ EOQ
type = "query alert" thresholds = {
warning = var.ingress_5xx_threshold_warning
thresholds { critical = var.ingress_5xx_threshold_critical
warning = "${var.ingress_5xx_threshold_warning}"
critical = "${var.ingress_5xx_threshold_critical}"
} }
evaluation_delay = var.evaluation_delay
new_host_delay = var.new_host_delay
notify_no_data = false notify_no_data = false
evaluation_delay = "${var.evaluation_delay}"
new_host_delay = "${var.new_host_delay}"
renotify_interval = 0 renotify_interval = 0
notify_audit = false notify_audit = false
timeout_h = 0 timeout_h = 0
@ -27,31 +26,30 @@ resource "datadog_monitor" "nginx_ingress_too_many_5xx" {
locked = false locked = false
require_full_window = true require_full_window = true
tags = ["env:${var.environment}", "type:caas", "provider:prometheus", "resource:nginx-ingress-controller", "team:claranet", "created-by:terraform", "${var.ingress_5xx_extra_tags}"] tags = concat(["env:${var.environment}", "type:caas", "provider:prometheus", "resource:nginx-ingress-controller", "team:claranet", "created-by:terraform"], var.ingress_5xx_extra_tags)
} }
resource "datadog_monitor" "nginx_ingress_too_many_4xx" { resource "datadog_monitor" "nginx_ingress_too_many_4xx" {
count = "${var.ingress_4xx_enabled == "true" ? 1 : 0}" count = var.ingress_4xx_enabled == "true" ? 1 : 0
name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Nginx Ingress 4xx errors {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Nginx Ingress 4xx errors {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
message = "${coalesce(var.ingress_4xx_message, var.message)}" message = coalesce(var.ingress_4xx_message, var.message)
type = "query alert"
query = <<EOQ query = <<EOQ
${var.ingress_4xx_time_aggregator}(${var.ingress_4xx_timeframe}): default( ${var.ingress_4xx_time_aggregator}(${var.ingress_4xx_timeframe}): default(
sum:nginx_ingress.nginx_upstream_responses_total${module.filter-tags-4xx.query_alert} by {upstream,ingress_class}.as_rate() / sum:nginx_ingress.nginx_upstream_responses_total${module.filter-tags-4xx.query_alert} by {upstream,ingress_class}.as_rate() /
(sum:nginx_ingress.nginx_upstream_requests_total${module.filter-tags.query_alert} by {upstream,ingress_class}.as_rate() + ${var.artificial_requests_count}) (sum:nginx_ingress.nginx_upstream_requests_total${module.filter-tags.query_alert} by {upstream,ingress_class}.as_rate() + ${var.artificial_requests_count})
* 100, 0) > ${var.ingress_4xx_threshold_critical} * 100, 0) > ${var.ingress_4xx_threshold_critical}
EOQ EOQ
type = "query alert" thresholds = {
warning = var.ingress_4xx_threshold_warning
thresholds { critical = var.ingress_4xx_threshold_critical
warning = "${var.ingress_4xx_threshold_warning}"
critical = "${var.ingress_4xx_threshold_critical}"
} }
evaluation_delay = var.evaluation_delay
new_host_delay = var.new_host_delay
notify_no_data = false notify_no_data = false
evaluation_delay = "${var.evaluation_delay}"
new_host_delay = "${var.new_host_delay}"
renotify_interval = 0 renotify_interval = 0
notify_audit = false notify_audit = false
timeout_h = 0 timeout_h = 0
@ -59,5 +57,6 @@ resource "datadog_monitor" "nginx_ingress_too_many_4xx" {
locked = false locked = false
require_full_window = true require_full_window = true
tags = ["env:${var.environment}", "type:caas", "provider:prometheus", "resource:nginx-ingress-controller", "team:claranet", "created-by:terraform", "${var.ingress_4xx_extra_tags}"] tags = concat(["env:${var.environment}", "type:caas", "provider:prometheus", "resource:nginx-ingress-controller", "team:claranet", "created-by:terraform"], var.ingress_4xx_extra_tags)
} }

View File

@ -1,9 +1,10 @@
output "nginx_ingress_too_many_5xx_id" { output "nginx_ingress_too_many_5xx_id" {
description = "id for monitor nginx_ingress_too_many_5xx" description = "id for monitor nginx_ingress_too_many_5xx"
value = "${datadog_monitor.nginx_ingress_too_many_5xx.*.id}" value = datadog_monitor.nginx_ingress_too_many_5xx.*.id
} }
output "nginx_ingress_too_many_4xx_id" { output "nginx_ingress_too_many_4xx_id" {
description = "id for monitor nginx_ingress_too_many_4xx" description = "id for monitor nginx_ingress_too_many_4xx"
value = "${datadog_monitor.nginx_ingress_too_many_4xx.*.id}" value = datadog_monitor.nginx_ingress_too_many_4xx.*.id
} }

View File

@ -6,8 +6,8 @@
module "datadog-monitors-caas-kubernetes-node" { module "datadog-monitors-caas-kubernetes-node" {
source = "git::ssh://git@git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors.git//caas/kubernetes/node?ref={revision}" source = "git::ssh://git@git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors.git//caas/kubernetes/node?ref={revision}"
environment = "${var.environment}" environment = var.environment
message = "${module.datadog-message-alerting.alerting-message}" message = module.datadog-message-alerting.alerting-message
} }
``` ```
@ -32,11 +32,11 @@ Creates DataDog monitors with the following checks:
| Name | Description | Type | Default | Required | | Name | Description | Type | Default | Required |
|------|-------------|:----:|:-----:|:-----:| |------|-------------|:----:|:-----:|:-----:|
| disk\_out\_enabled | Flag to enable Out of disk monitor | string | `"true"` | no | | disk\_out\_enabled | Flag to enable Out of disk monitor | string | `"true"` | no |
| disk\_out\_extra\_tags | Extra tags for Out of disk monitor | list | `[]` | no | | disk\_out\_extra\_tags | Extra tags for Out of disk monitor | list(string) | `[]` | no |
| disk\_out\_message | Custom message for Out of disk monitor | string | `""` | no | | disk\_out\_message | Custom message for Out of disk monitor | string | `""` | no |
| disk\_out\_threshold\_warning | Out of disk monitor (warning threshold) | string | `"3"` | no | | disk\_out\_threshold\_warning | Out of disk monitor (warning threshold) | string | `"3"` | no |
| disk\_pressure\_enabled | Flag to enable Disk pressure monitor | string | `"true"` | no | | disk\_pressure\_enabled | Flag to enable Disk pressure monitor | string | `"true"` | no |
| disk\_pressure\_extra\_tags | Extra tags for Disk pressure monitor | list | `[]` | no | | disk\_pressure\_extra\_tags | Extra tags for Disk pressure monitor | list(string) | `[]` | no |
| disk\_pressure\_message | Custom message for Disk pressure monitor | string | `""` | no | | disk\_pressure\_message | Custom message for Disk pressure monitor | string | `""` | no |
| disk\_pressure\_threshold\_warning | Disk pressure monitor (warning threshold) | string | `"3"` | no | | disk\_pressure\_threshold\_warning | Disk pressure monitor (warning threshold) | string | `"3"` | no |
| environment | Architecture environment | string | n/a | yes | | environment | Architecture environment | string | n/a | yes |
@ -45,44 +45,44 @@ Creates DataDog monitors with the following checks:
| filter\_tags\_custom\_excluded | Tags excluded for custom filtering when filter_tags_use_defaults is false | string | `""` | no | | filter\_tags\_custom\_excluded | Tags excluded for custom filtering when filter_tags_use_defaults is false | string | `""` | no |
| filter\_tags\_use\_defaults | Use default filter tags convention | string | `"true"` | no | | filter\_tags\_use\_defaults | Use default filter tags convention | string | `"true"` | no |
| kubelet\_ping\_enabled | Flag to enable Kubelet ping monitor | string | `"true"` | no | | kubelet\_ping\_enabled | Flag to enable Kubelet ping monitor | string | `"true"` | no |
| kubelet\_ping\_extra\_tags | Extra tags for Kubelet ping monitor | list | `[]` | no | | kubelet\_ping\_extra\_tags | Extra tags for Kubelet ping monitor | list(string) | `[]` | no |
| kubelet\_ping\_message | Custom message for Kubelet ping monitor | string | `""` | no | | kubelet\_ping\_message | Custom message for Kubelet ping monitor | string | `""` | no |
| kubelet\_ping\_threshold\_warning | Kubelet ping monitor (warning threshold) | string | `"3"` | no | | kubelet\_ping\_threshold\_warning | Kubelet ping monitor (warning threshold) | string | `"3"` | no |
| kubelet\_syncloop\_enabled | Flag to enable Kubelet sync loop monitor | string | `"true"` | no | | kubelet\_syncloop\_enabled | Flag to enable Kubelet sync loop monitor | string | `"true"` | no |
| kubelet\_syncloop\_extra\_tags | Extra tags for Kubelet sync loop monitor | list | `[]` | no | | kubelet\_syncloop\_extra\_tags | Extra tags for Kubelet sync loop monitor | list(string) | `[]` | no |
| kubelet\_syncloop\_message | Custom message for Kubelet sync loop monitor | string | `""` | no | | kubelet\_syncloop\_message | Custom message for Kubelet sync loop monitor | string | `""` | no |
| kubelet\_syncloop\_threshold\_warning | Kubelet sync loop monitor (warning threshold) | string | `"3"` | no | | kubelet\_syncloop\_threshold\_warning | Kubelet sync loop monitor (warning threshold) | string | `"3"` | no |
| memory\_pressure\_enabled | Flag to enable Memory pressure monitor | string | `"true"` | no | | memory\_pressure\_enabled | Flag to enable Memory pressure monitor | string | `"true"` | no |
| memory\_pressure\_extra\_tags | Extra tags for Memory pressure monitor | list | `[]` | no | | memory\_pressure\_extra\_tags | Extra tags for Memory pressure monitor | list(string) | `[]` | no |
| memory\_pressure\_message | Custom message for Memory pressure monitor | string | `""` | no | | memory\_pressure\_message | Custom message for Memory pressure monitor | string | `""` | no |
| memory\_pressure\_threshold\_warning | Memory pressure monitor (warning threshold) | string | `"3"` | no | | memory\_pressure\_threshold\_warning | Memory pressure monitor (warning threshold) | string | `"3"` | no |
| message | Message sent when a monitor is triggered | string | n/a | yes | | message | Message sent when a monitor is triggered | string | n/a | yes |
| new\_host\_delay | Delay in seconds before monitor new resource | string | `"300"` | no | | new\_host\_delay | Delay in seconds before monitor new resource | string | `"300"` | no |
| node\_unschedulable\_enabled | Flag to enable node unschedulable monitor | string | `"true"` | no | | node\_unschedulable\_enabled | Flag to enable node unschedulable monitor | string | `"true"` | no |
| node\_unschedulable\_extra\_tags | Extra tags for node unschedulable monitor | list | `[]` | no | | node\_unschedulable\_extra\_tags | Extra tags for node unschedulable monitor | list(string) | `[]` | no |
| node\_unschedulable\_message | Custom message for node unschedulable monitor | string | `""` | no | | node\_unschedulable\_message | Custom message for node unschedulable monitor | string | `""` | no |
| node\_unschedulable\_time\_aggregator | Monitor aggregator for node unschedulable [available values: min, max or avg] | string | `"min"` | no | | node\_unschedulable\_time\_aggregator | Monitor aggregator for node unschedulable [available values: min, max or avg] | string | `"min"` | no |
| node\_unschedulable\_timeframe | Monitor timeframe for node unschedulable [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_1h"` | no | | node\_unschedulable\_timeframe | Monitor timeframe for node unschedulable [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_1h"` | no |
| prefix\_slug | Prefix string to prepend between brackets on every monitors names | string | `""` | no | | prefix\_slug | Prefix string to prepend between brackets on every monitors names | string | `""` | no |
| ready\_enabled | Flag to enable Node ready monitor | string | `"true"` | no | | ready\_enabled | Flag to enable Node ready monitor | string | `"true"` | no |
| ready\_extra\_tags | Extra tags for Node ready monitor | list | `[]` | no | | ready\_extra\_tags | Extra tags for Node ready monitor | list(string) | `[]` | no |
| ready\_message | Custom message for Node ready monitor | string | `""` | no | | ready\_message | Custom message for Node ready monitor | string | `""` | no |
| ready\_threshold\_warning | Node ready monitor (warning threshold) | string | `"3"` | no | | ready\_threshold\_warning | Node ready monitor (warning threshold) | string | `"3"` | no |
| unregister\_net\_device\_enabled | Flag to enable Unregister net device monitor | string | `"true"` | no | | unregister\_net\_device\_enabled | Flag to enable Unregister net device monitor | string | `"true"` | no |
| unregister\_net\_device\_extra\_tags | Extra tags for Unregister net device monitor | list | `[]` | no | | unregister\_net\_device\_extra\_tags | Extra tags for Unregister net device monitor | list(string) | `[]` | no |
| unregister\_net\_device\_message | Custom message for Unregister net device monitor | string | `""` | no | | unregister\_net\_device\_message | Custom message for Unregister net device monitor | string | `""` | no |
| unregister\_net\_device\_threshold\_critical | Unregister net device critical threshold | string | `"3"` | no | | unregister\_net\_device\_threshold\_critical | Unregister net device critical threshold | string | `"3"` | no |
| unregister\_net\_device\_time\_aggregator | Monitor aggregator for Unregister net device [available values: min, max or avg] | string | `"min"` | no | | unregister\_net\_device\_time\_aggregator | Monitor aggregator for Unregister net device [available values: min, max or avg] | string | `"min"` | no |
| unregister\_net\_device\_timeframe | Monitor timeframe for Unregister net device [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"15m"` | no | | unregister\_net\_device\_timeframe | Monitor timeframe for Unregister net device [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"15m"` | no |
| volume\_inodes\_enabled | Flag to enable Volume inodes monitor | string | `"true"` | no | | volume\_inodes\_enabled | Flag to enable Volume inodes monitor | string | `"true"` | no |
| volume\_inodes\_extra\_tags | Extra tags for Volume inodes monitor | list | `[]` | no | | volume\_inodes\_extra\_tags | Extra tags for Volume inodes monitor | list(string) | `[]` | no |
| volume\_inodes\_message | Custom message for Volume inodes monitor | string | `""` | no | | volume\_inodes\_message | Custom message for Volume inodes monitor | string | `""` | no |
| volume\_inodes\_threshold\_critical | Volume inodes critical threshold | string | `"95"` | no | | volume\_inodes\_threshold\_critical | Volume inodes critical threshold | string | `"95"` | no |
| volume\_inodes\_threshold\_warning | Volume inodes warning threshold | string | `"90"` | no | | volume\_inodes\_threshold\_warning | Volume inodes warning threshold | string | `"90"` | no |
| volume\_inodes\_time\_aggregator | Monitor aggregator for Volume inodes [available values: min, max or avg] | string | `"min"` | no | | volume\_inodes\_time\_aggregator | Monitor aggregator for Volume inodes [available values: min, max or avg] | string | `"min"` | no |
| volume\_inodes\_timeframe | Monitor timeframe for Volume inodes [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no | | volume\_inodes\_timeframe | Monitor timeframe for Volume inodes [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no |
| volume\_space\_enabled | Flag to enable Volume space monitor | string | `"true"` | no | | volume\_space\_enabled | Flag to enable Volume space monitor | string | `"true"` | no |
| volume\_space\_extra\_tags | Extra tags for Volume space monitor | list | `[]` | no | | volume\_space\_extra\_tags | Extra tags for Volume space monitor | list(string) | `[]` | no |
| volume\_space\_message | Custom message for Volume space monitor | string | `""` | no | | volume\_space\_message | Custom message for Volume space monitor | string | `""` | no |
| volume\_space\_threshold\_critical | Volume space critical threshold | string | `"95"` | no | | volume\_space\_threshold\_critical | Volume space critical threshold | string | `"95"` | no |
| volume\_space\_threshold\_warning | Volume space warning threshold | string | `"90"` | no | | volume\_space\_threshold\_warning | Volume space warning threshold | string | `"90"` | no |

View File

@ -42,175 +42,175 @@ variable "prefix_slug" {
variable "disk_pressure_enabled" { variable "disk_pressure_enabled" {
description = "Flag to enable Disk pressure monitor" description = "Flag to enable Disk pressure monitor"
type = "string" type = string
default = "true" default = "true"
} }
variable "disk_pressure_extra_tags" { variable "disk_pressure_extra_tags" {
description = "Extra tags for Disk pressure monitor" description = "Extra tags for Disk pressure monitor"
type = "list" type = list(string)
default = [] default = []
} }
variable "disk_pressure_message" { variable "disk_pressure_message" {
description = "Custom message for Disk pressure monitor" description = "Custom message for Disk pressure monitor"
type = "string" type = string
default = "" default = ""
} }
variable "disk_pressure_threshold_warning" { variable "disk_pressure_threshold_warning" {
description = "Disk pressure monitor (warning threshold)" description = "Disk pressure monitor (warning threshold)"
type = "string" type = string
default = 3 default = 3
} }
variable "disk_out_enabled" { variable "disk_out_enabled" {
description = "Flag to enable Out of disk monitor" description = "Flag to enable Out of disk monitor"
type = "string" type = string
default = "true" default = "true"
} }
variable "disk_out_extra_tags" { variable "disk_out_extra_tags" {
description = "Extra tags for Out of disk monitor" description = "Extra tags for Out of disk monitor"
type = "list" type = list(string)
default = [] default = []
} }
variable "disk_out_message" { variable "disk_out_message" {
description = "Custom message for Out of disk monitor" description = "Custom message for Out of disk monitor"
type = "string" type = string
default = "" default = ""
} }
variable "disk_out_threshold_warning" { variable "disk_out_threshold_warning" {
description = "Out of disk monitor (warning threshold)" description = "Out of disk monitor (warning threshold)"
type = "string" type = string
default = 3 default = 3
} }
variable "memory_pressure_enabled" { variable "memory_pressure_enabled" {
description = "Flag to enable Memory pressure monitor" description = "Flag to enable Memory pressure monitor"
type = "string" type = string
default = "true" default = "true"
} }
variable "memory_pressure_extra_tags" { variable "memory_pressure_extra_tags" {
description = "Extra tags for Memory pressure monitor" description = "Extra tags for Memory pressure monitor"
type = "list" type = list(string)
default = [] default = []
} }
variable "memory_pressure_message" { variable "memory_pressure_message" {
description = "Custom message for Memory pressure monitor" description = "Custom message for Memory pressure monitor"
type = "string" type = string
default = "" default = ""
} }
variable "memory_pressure_threshold_warning" { variable "memory_pressure_threshold_warning" {
description = "Memory pressure monitor (warning threshold)" description = "Memory pressure monitor (warning threshold)"
type = "string" type = string
default = 3 default = 3
} }
variable "ready_enabled" { variable "ready_enabled" {
description = "Flag to enable Node ready monitor" description = "Flag to enable Node ready monitor"
type = "string" type = string
default = "true" default = "true"
} }
variable "ready_extra_tags" { variable "ready_extra_tags" {
description = "Extra tags for Node ready monitor" description = "Extra tags for Node ready monitor"
type = "list" type = list(string)
default = [] default = []
} }
variable "ready_message" { variable "ready_message" {
description = "Custom message for Node ready monitor" description = "Custom message for Node ready monitor"
type = "string" type = string
default = "" default = ""
} }
variable "ready_threshold_warning" { variable "ready_threshold_warning" {
description = "Node ready monitor (warning threshold)" description = "Node ready monitor (warning threshold)"
type = "string" type = string
default = 3 default = 3
} }
variable "kubelet_ping_enabled" { variable "kubelet_ping_enabled" {
description = "Flag to enable Kubelet ping monitor" description = "Flag to enable Kubelet ping monitor"
type = "string" type = string
default = "true" default = "true"
} }
variable "kubelet_ping_extra_tags" { variable "kubelet_ping_extra_tags" {
description = "Extra tags for Kubelet ping monitor" description = "Extra tags for Kubelet ping monitor"
type = "list" type = list(string)
default = [] default = []
} }
variable "kubelet_ping_message" { variable "kubelet_ping_message" {
description = "Custom message for Kubelet ping monitor" description = "Custom message for Kubelet ping monitor"
type = "string" type = string
default = "" default = ""
} }
variable "kubelet_ping_threshold_warning" { variable "kubelet_ping_threshold_warning" {
description = "Kubelet ping monitor (warning threshold)" description = "Kubelet ping monitor (warning threshold)"
type = "string" type = string
default = 3 default = 3
} }
variable "kubelet_syncloop_enabled" { variable "kubelet_syncloop_enabled" {
description = "Flag to enable Kubelet sync loop monitor" description = "Flag to enable Kubelet sync loop monitor"
type = "string" type = string
default = "true" default = "true"
} }
variable "kubelet_syncloop_extra_tags" { variable "kubelet_syncloop_extra_tags" {
description = "Extra tags for Kubelet sync loop monitor" description = "Extra tags for Kubelet sync loop monitor"
type = "list" type = list(string)
default = [] default = []
} }
variable "kubelet_syncloop_message" { variable "kubelet_syncloop_message" {
description = "Custom message for Kubelet sync loop monitor" description = "Custom message for Kubelet sync loop monitor"
type = "string" type = string
default = "" default = ""
} }
variable "kubelet_syncloop_threshold_warning" { variable "kubelet_syncloop_threshold_warning" {
description = "Kubelet sync loop monitor (warning threshold)" description = "Kubelet sync loop monitor (warning threshold)"
type = "string" type = string
default = 3 default = 3
} }
variable "unregister_net_device_enabled" { variable "unregister_net_device_enabled" {
description = "Flag to enable Unregister net device monitor" description = "Flag to enable Unregister net device monitor"
type = "string" type = string
default = "true" default = "true"
} }
variable "unregister_net_device_extra_tags" { variable "unregister_net_device_extra_tags" {
description = "Extra tags for Unregister net device monitor" description = "Extra tags for Unregister net device monitor"
type = "list" type = list(string)
default = [] default = []
} }
variable "unregister_net_device_message" { variable "unregister_net_device_message" {
description = "Custom message for Unregister net device monitor" description = "Custom message for Unregister net device monitor"
type = "string" type = string
default = "" default = ""
} }
variable "unregister_net_device_time_aggregator" { variable "unregister_net_device_time_aggregator" {
description = "Monitor aggregator for Unregister net device [available values: min, max or avg]" description = "Monitor aggregator for Unregister net device [available values: min, max or avg]"
type = "string" type = string
default = "min" default = "min"
} }
variable "unregister_net_device_timeframe" { variable "unregister_net_device_timeframe" {
description = "Monitor timeframe for Unregister net device [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" description = "Monitor timeframe for Unregister net device [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string" type = string
default = "15m" default = "15m"
} }
@ -221,61 +221,61 @@ variable "unregister_net_device_threshold_critical" {
variable "node_unschedulable_enabled" { variable "node_unschedulable_enabled" {
description = "Flag to enable node unschedulable monitor" description = "Flag to enable node unschedulable monitor"
type = "string" type = string
default = "true" default = "true"
} }
variable "node_unschedulable_extra_tags" { variable "node_unschedulable_extra_tags" {
description = "Extra tags for node unschedulable monitor" description = "Extra tags for node unschedulable monitor"
type = "list" type = list(string)
default = [] default = []
} }
variable "node_unschedulable_message" { variable "node_unschedulable_message" {
description = "Custom message for node unschedulable monitor" description = "Custom message for node unschedulable monitor"
type = "string" type = string
default = "" default = ""
} }
variable "node_unschedulable_time_aggregator" { variable "node_unschedulable_time_aggregator" {
description = "Monitor aggregator for node unschedulable [available values: min, max or avg]" description = "Monitor aggregator for node unschedulable [available values: min, max or avg]"
type = "string" type = string
default = "min" default = "min"
} }
variable "node_unschedulable_timeframe" { variable "node_unschedulable_timeframe" {
description = "Monitor timeframe for node unschedulable [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" description = "Monitor timeframe for node unschedulable [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string" type = string
default = "last_1h" default = "last_1h"
} }
variable "volume_space_enabled" { variable "volume_space_enabled" {
description = "Flag to enable Volume space monitor" description = "Flag to enable Volume space monitor"
type = "string" type = string
default = "true" default = "true"
} }
variable "volume_space_extra_tags" { variable "volume_space_extra_tags" {
description = "Extra tags for Volume space monitor" description = "Extra tags for Volume space monitor"
type = "list" type = list(string)
default = [] default = []
} }
variable "volume_space_message" { variable "volume_space_message" {
description = "Custom message for Volume space monitor" description = "Custom message for Volume space monitor"
type = "string" type = string
default = "" default = ""
} }
variable "volume_space_time_aggregator" { variable "volume_space_time_aggregator" {
description = "Monitor aggregator for Volume space [available values: min, max or avg]" description = "Monitor aggregator for Volume space [available values: min, max or avg]"
type = "string" type = string
default = "min" default = "min"
} }
variable "volume_space_timeframe" { variable "volume_space_timeframe" {
description = "Monitor timeframe for Volume space [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" description = "Monitor timeframe for Volume space [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string" type = string
default = "last_5m" default = "last_5m"
} }
@ -291,31 +291,31 @@ variable "volume_space_threshold_warning" {
variable "volume_inodes_enabled" { variable "volume_inodes_enabled" {
description = "Flag to enable Volume inodes monitor" description = "Flag to enable Volume inodes monitor"
type = "string" type = string
default = "true" default = "true"
} }
variable "volume_inodes_extra_tags" { variable "volume_inodes_extra_tags" {
description = "Extra tags for Volume inodes monitor" description = "Extra tags for Volume inodes monitor"
type = "list" type = list(string)
default = [] default = []
} }
variable "volume_inodes_message" { variable "volume_inodes_message" {
description = "Custom message for Volume inodes monitor" description = "Custom message for Volume inodes monitor"
type = "string" type = string
default = "" default = ""
} }
variable "volume_inodes_time_aggregator" { variable "volume_inodes_time_aggregator" {
description = "Monitor aggregator for Volume inodes [available values: min, max or avg]" description = "Monitor aggregator for Volume inodes [available values: min, max or avg]"
type = "string" type = string
default = "min" default = "min"
} }
variable "volume_inodes_timeframe" { variable "volume_inodes_timeframe" {
description = "Monitor timeframe for Volume inodes [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" description = "Monitor timeframe for Volume inodes [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string" type = string
default = "last_5m" default = "last_5m"
} }
@ -328,3 +328,4 @@ variable "volume_inodes_threshold_warning" {
default = 90 default = 90
description = "Volume inodes warning threshold" description = "Volume inodes warning threshold"
} }

View File

@ -1,20 +1,21 @@
module "filter-tags" { module "filter-tags" {
source = "../../../common/filter-tags" source = "../../../common/filter-tags"
environment = "${var.environment}" environment = var.environment
resource = "kubernetes" resource = "kubernetes"
filter_tags_use_defaults = "${var.filter_tags_use_defaults}" filter_tags_use_defaults = var.filter_tags_use_defaults
filter_tags_custom = "${var.filter_tags_custom}" filter_tags_custom = var.filter_tags_custom
filter_tags_custom_excluded = "${var.filter_tags_custom_excluded}" filter_tags_custom_excluded = var.filter_tags_custom_excluded
} }
module "filter-tags-unschedulable" { module "filter-tags-unschedulable" {
source = "../../../common/filter-tags" source = "../../../common/filter-tags"
environment = "${var.environment}" environment = var.environment
resource = "kubernetes" resource = "kubernetes"
filter_tags_use_defaults = "${var.filter_tags_use_defaults}" filter_tags_use_defaults = var.filter_tags_use_defaults
filter_tags_custom = "${var.filter_tags_custom}" filter_tags_custom = var.filter_tags_custom
filter_tags_custom_excluded = "${var.filter_tags_custom_excluded}" filter_tags_custom_excluded = var.filter_tags_custom_excluded
extra_tags = ["status:unschedulable"] extra_tags = ["status:unschedulable"]
} }

View File

@ -1,20 +1,19 @@
resource "datadog_monitor" "disk_pressure" { resource "datadog_monitor" "disk_pressure" {
count = "${var.disk_pressure_enabled == "true" ? 1 : 0}" count = var.disk_pressure_enabled == "true" ? 1 : 0
name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Kubernetes Node Disk pressure" name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Kubernetes Node Disk pressure"
message = "${coalesce(var.disk_pressure_message, var.message)}" message = coalesce(var.disk_pressure_message, var.message)
type = "service check"
type = "service check"
query = <<EOQ query = <<EOQ
"kubernetes_state.node.disk_pressure"${module.filter-tags.service_check}.by("kubernetescluster","node").last(6).count_by_status() "kubernetes_state.node.disk_pressure"${module.filter-tags.service_check}.by("kubernetescluster","node").last(6).count_by_status()
EOQ EOQ
thresholds = { thresholds = {
warning = "${var.disk_pressure_threshold_warning}" warning = var.disk_pressure_threshold_warning
critical = 5 critical = 5
} }
new_host_delay = "${var.new_host_delay}" new_host_delay = var.new_host_delay
notify_no_data = false notify_no_data = false
renotify_interval = 0 renotify_interval = 0
notify_audit = false notify_audit = false
@ -23,26 +22,25 @@ resource "datadog_monitor" "disk_pressure" {
locked = false locked = false
require_full_window = true require_full_window = true
tags = ["env:${var.environment}", "type:caas", "provider:kubernetes", "resource:kubernetes-node", "team:claranet", "created-by:terraform", "${var.disk_pressure_extra_tags}"] tags = concat(["env:${var.environment}", "type:caas", "provider:kubernetes", "resource:kubernetes-node", "team:claranet", "created-by:terraform"], var.disk_pressure_extra_tags)
} }
resource "datadog_monitor" "disk_out" { resource "datadog_monitor" "disk_out" {
count = "${var.disk_out_enabled == "true" ? 1 : 0}" count = var.disk_out_enabled == "true" ? 1 : 0
name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Kubernetes Node Out of disk" name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Kubernetes Node Out of disk"
message = "${coalesce(var.disk_out_message, var.message)}" message = coalesce(var.disk_out_message, var.message)
type = "service check"
type = "service check"
query = <<EOQ query = <<EOQ
"kubernetes_state.node.out_of_disk"${module.filter-tags.service_check}.by("kubernetescluster","node").last(6).count_by_status() "kubernetes_state.node.out_of_disk"${module.filter-tags.service_check}.by("kubernetescluster","node").last(6).count_by_status()
EOQ EOQ
thresholds = { thresholds = {
warning = "${var.disk_out_threshold_warning}" warning = var.disk_out_threshold_warning
critical = 5 critical = 5
} }
new_host_delay = "${var.new_host_delay}" new_host_delay = var.new_host_delay
notify_no_data = false notify_no_data = false
renotify_interval = 0 renotify_interval = 0
notify_audit = false notify_audit = false
@ -51,26 +49,25 @@ resource "datadog_monitor" "disk_out" {
locked = false locked = false
require_full_window = true require_full_window = true
tags = ["env:${var.environment}", "type:caas", "provider:kubernetes", "resource:kubernetes-node", "team:claranet", "created-by:terraform", "${var.disk_out_extra_tags}"] tags = concat(["env:${var.environment}", "type:caas", "provider:kubernetes", "resource:kubernetes-node", "team:claranet", "created-by:terraform"], var.disk_out_extra_tags)
} }
resource "datadog_monitor" "memory_pressure" { resource "datadog_monitor" "memory_pressure" {
count = "${var.memory_pressure_enabled == "true" ? 1 : 0}" count = var.memory_pressure_enabled == "true" ? 1 : 0
name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Kubernetes Node Memory pressure" name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Kubernetes Node Memory pressure"
message = "${coalesce(var.memory_pressure_message, var.message)}" message = coalesce(var.memory_pressure_message, var.message)
type = "service check"
type = "service check" query = <<EOQ
query = <<EOQ
"kubernetes_state.node.memory_pressure"${module.filter-tags.service_check}.by("kubernetescluster","node").last(6).count_by_status() "kubernetes_state.node.memory_pressure"${module.filter-tags.service_check}.by("kubernetescluster","node").last(6).count_by_status()
EOQ EOQ
thresholds = { thresholds = {
warning = "${var.memory_pressure_threshold_warning}" warning = var.memory_pressure_threshold_warning
critical = 5 critical = 5
} }
new_host_delay = "${var.new_host_delay}" new_host_delay = var.new_host_delay
notify_no_data = false notify_no_data = false
renotify_interval = 0 renotify_interval = 0
notify_audit = false notify_audit = false
@ -79,26 +76,25 @@ resource "datadog_monitor" "memory_pressure" {
locked = false locked = false
require_full_window = true require_full_window = true
tags = ["env:${var.environment}", "type:caas", "provider:kubernetes", "resource:kubernetes-node", "team:claranet", "created-by:terraform", "${var.memory_pressure_extra_tags}"] tags = concat(["env:${var.environment}", "type:caas", "provider:kubernetes", "resource:kubernetes-node", "team:claranet", "created-by:terraform"], var.memory_pressure_extra_tags)
} }
resource "datadog_monitor" "ready" { resource "datadog_monitor" "ready" {
count = "${var.ready_enabled == "true" ? 1 : 0}" count = var.ready_enabled == "true" ? 1 : 0
name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Kubernetes Node not ready" name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Kubernetes Node not ready"
message = "${coalesce(var.ready_message, var.message)}" message = coalesce(var.ready_message, var.message)
type = "service check"
type = "service check" query = <<EOQ
query = <<EOQ
"kubernetes_state.node.ready"${module.filter-tags.service_check}.by("kubernetescluster","node").last(6).count_by_status() "kubernetes_state.node.ready"${module.filter-tags.service_check}.by("kubernetescluster","node").last(6).count_by_status()
EOQ EOQ
thresholds = { thresholds = {
warning = "${var.ready_threshold_warning}" warning = var.ready_threshold_warning
critical = 5 critical = 5
} }
new_host_delay = "${var.new_host_delay}" new_host_delay = var.new_host_delay
notify_no_data = false notify_no_data = false
renotify_interval = 0 renotify_interval = 0
notify_audit = false notify_audit = false
@ -107,26 +103,25 @@ resource "datadog_monitor" "ready" {
locked = false locked = false
require_full_window = true require_full_window = true
tags = ["env:${var.environment}", "type:caas", "provider:kubernetes", "resource:kubernetes-node", "team:claranet", "created-by:terraform", "${var.ready_extra_tags}"] tags = concat(["env:${var.environment}", "type:caas", "provider:kubernetes", "resource:kubernetes-node", "team:claranet", "created-by:terraform"], var.ready_extra_tags)
} }
resource "datadog_monitor" "kubelet_ping" { resource "datadog_monitor" "kubelet_ping" {
count = "${var.kubelet_ping_enabled == "true" ? 1 : 0}" count = var.kubelet_ping_enabled == "true" ? 1 : 0
name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Kubernetes Node Kubelet API does not respond" name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Kubernetes Node Kubelet API does not respond"
message = "${coalesce(var.kubelet_ping_message, var.message)}" message = coalesce(var.kubelet_ping_message, var.message)
type = "service check"
type = "service check"
query = <<EOQ query = <<EOQ
"kubernetes.kubelet.check.ping"${module.filter-tags.service_check}.by("kubernetescluster","name").last(6).count_by_status() "kubernetes.kubelet.check.ping"${module.filter-tags.service_check}.by("kubernetescluster","name").last(6).count_by_status()
EOQ EOQ
thresholds = { thresholds = {
warning = "${var.kubelet_ping_threshold_warning}" warning = var.kubelet_ping_threshold_warning
critical = 5 critical = 5
} }
new_host_delay = "${var.new_host_delay}" new_host_delay = var.new_host_delay
notify_no_data = true notify_no_data = true
renotify_interval = 0 renotify_interval = 0
notify_audit = false notify_audit = false
@ -135,26 +130,25 @@ resource "datadog_monitor" "kubelet_ping" {
locked = false locked = false
require_full_window = true require_full_window = true
tags = ["env:${var.environment}", "type:caas", "provider:kubernetes", "resource:kubernetes-node", "team:claranet", "created-by:terraform", "${var.kubelet_ping_extra_tags}"] tags = concat(["env:${var.environment}", "type:caas", "provider:kubernetes", "resource:kubernetes-node", "team:claranet", "created-by:terraform"], var.kubelet_ping_extra_tags)
} }
resource "datadog_monitor" "kubelet_syncloop" { resource "datadog_monitor" "kubelet_syncloop" {
count = "${var.kubelet_syncloop_enabled == "true" ? 1 : 0}" count = var.kubelet_syncloop_enabled == "true" ? 1 : 0
name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Kubernetes Node Kubelet sync loop that updates containers does not work" name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Kubernetes Node Kubelet sync loop that updates containers does not work"
message = "${coalesce(var.kubelet_syncloop_message, var.message)}" message = coalesce(var.kubelet_syncloop_message, var.message)
type = "service check"
type = "service check"
query = <<EOQ query = <<EOQ
"kubernetes.kubelet.check.syncloop"${module.filter-tags.service_check}.by("kubernetescluster","name").last(6).count_by_status() "kubernetes.kubelet.check.syncloop"${module.filter-tags.service_check}.by("kubernetescluster","name").last(6).count_by_status()
EOQ EOQ
thresholds = { thresholds = {
warning = "${var.kubelet_syncloop_threshold_warning}" warning = var.kubelet_syncloop_threshold_warning
critical = 5 critical = 5
} }
new_host_delay = "${var.new_host_delay}" new_host_delay = var.new_host_delay
notify_no_data = false notify_no_data = false
renotify_interval = 0 renotify_interval = 0
notify_audit = false notify_audit = false
@ -163,21 +157,20 @@ resource "datadog_monitor" "kubelet_syncloop" {
locked = false locked = false
require_full_window = true require_full_window = true
tags = ["env:${var.environment}", "type:caas", "provider:kubernetes", "resource:kubernetes-node", "team:claranet", "created-by:terraform", "${var.kubelet_syncloop_extra_tags}"] tags = concat(["env:${var.environment}", "type:caas", "provider:kubernetes", "resource:kubernetes-node", "team:claranet", "created-by:terraform"], var.kubelet_syncloop_extra_tags)
} }
resource "datadog_monitor" "unregister_net_device" { resource "datadog_monitor" "unregister_net_device" {
count = "${var.unregister_net_device_enabled == "true" ? 1 : 0}" count = var.unregister_net_device_enabled == "true" ? 1 : 0
name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Kubernetes Node Frequent unregister net device" name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Kubernetes Node Frequent unregister net device"
message = coalesce(var.unregister_net_device_message, var.message)
type = "event alert" type = "event alert"
message = "${coalesce(var.unregister_net_device_message, var.message)}"
query = <<EOQ query = <<EOQ
events('sources:kubernetes priority:all ${module.filter-tags.event_alert} \"UnregisterNetDevice\"').rollup('count').last('${var.unregister_net_device_timeframe}') > ${var.unregister_net_device_threshold_critical} events('sources:kubernetes priority:all ${module.filter-tags.event_alert} \"UnregisterNetDevice\"').rollup('count').last('${var.unregister_net_device_timeframe}') > ${var.unregister_net_device_threshold_critical}
EOQ EOQ
new_host_delay = "${var.new_host_delay}"
new_host_delay = var.new_host_delay
notify_no_data = false notify_no_data = false
renotify_interval = 0 renotify_interval = 0
notify_audit = false notify_audit = false
@ -185,28 +178,27 @@ resource "datadog_monitor" "unregister_net_device" {
include_tags = true include_tags = true
locked = false locked = false
tags = ["env:${var.environment}", "type:caas", "provider:kubernetes", "resource:kubernetes-node", "team:claranet", "created-by:terraform", "${var.unregister_net_device_extra_tags}"] tags = concat(["env:${var.environment}", "type:caas", "provider:kubernetes", "resource:kubernetes-node", "team:claranet", "created-by:terraform"], var.unregister_net_device_extra_tags)
} }
resource "datadog_monitor" "node_unschedulable" { resource "datadog_monitor" "node_unschedulable" {
count = "${var.node_unschedulable_enabled == "true" ? 1 : 0}" count = var.node_unschedulable_enabled == "true" ? 1 : 0
name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Kubernetes Node unschedulable" name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Kubernetes Node unschedulable"
message = coalesce(var.node_unschedulable_message, var.message)
type = "metric alert" type = "metric alert"
message = "${coalesce(var.node_unschedulable_message, var.message)}"
query = <<EOQ query = <<EOQ
${var.node_unschedulable_time_aggregator}(${var.node_unschedulable_timeframe}): ${var.node_unschedulable_time_aggregator}(${var.node_unschedulable_timeframe}):
sum:kubernetes_state.node.status${module.filter-tags-unschedulable.query_alert} by {kubernetescluster,node} sum:kubernetes_state.node.status${module.filter-tags-unschedulable.query_alert} by {kubernetescluster,node}
> 0 > 0
EOQ EOQ
thresholds { thresholds = {
critical = 0 critical = 0
} }
evaluation_delay = "${var.evaluation_delay}" evaluation_delay = var.evaluation_delay
new_host_delay = "${var.new_host_delay}" new_host_delay = var.new_host_delay
notify_no_data = false notify_no_data = false
renotify_interval = 0 renotify_interval = 0
notify_audit = false notify_audit = false
@ -215,30 +207,29 @@ resource "datadog_monitor" "node_unschedulable" {
locked = false locked = false
require_full_window = true require_full_window = true
tags = ["env:${var.environment}", "type:caas", "provider:kubernetes", "resource:kubernetes-node", "team:claranet", "created-by:terraform", "${var.node_unschedulable_extra_tags}"] tags = concat(["env:${var.environment}", "type:caas", "provider:kubernetes", "resource:kubernetes-node", "team:claranet", "created-by:terraform"], var.node_unschedulable_extra_tags)
} }
resource "datadog_monitor" "volume_space" { resource "datadog_monitor" "volume_space" {
count = "${var.volume_space_enabled == "true" ? 1 : 0}" count = var.volume_space_enabled == "true" ? 1 : 0
name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Kubernetes Node volume space usage {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Kubernetes Node volume space usage {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
message = coalesce(var.volume_space_message, var.message)
type = "query alert" type = "query alert"
message = "${coalesce(var.volume_space_message, var.message)}"
query = <<EOQ query = <<EOQ
${var.volume_space_time_aggregator}(${var.volume_space_timeframe}): ${var.volume_space_time_aggregator}(${var.volume_space_timeframe}):
avg:kubernetes.kubelet.volume.stats.used_bytes${module.filter-tags.query_alert} by {kubernetescluster,name,persistentvolumeclaim} / avg:kubernetes.kubelet.volume.stats.used_bytes${module.filter-tags.query_alert} by {kubernetescluster,name,persistentvolumeclaim} /
avg:kubernetes.kubelet.volume.stats.capacity_bytes${module.filter-tags.query_alert} by {kubernetescluster,name,persistentvolumeclaim} avg:kubernetes.kubelet.volume.stats.capacity_bytes${module.filter-tags.query_alert} by {kubernetescluster,name,persistentvolumeclaim}
* 100 > ${var.volume_space_threshold_critical} * 100 > ${var.volume_space_threshold_critical}
EOQ EOQ
thresholds { thresholds = {
critical = "${var.volume_space_threshold_critical}" critical = var.volume_space_threshold_critical
warning = "${var.volume_space_threshold_warning}" warning = var.volume_space_threshold_warning
} }
evaluation_delay = "${var.evaluation_delay}" evaluation_delay = var.evaluation_delay
new_host_delay = "${var.new_host_delay}" new_host_delay = var.new_host_delay
notify_no_data = false notify_no_data = false
renotify_interval = 0 renotify_interval = 0
notify_audit = false notify_audit = false
@ -247,30 +238,29 @@ resource "datadog_monitor" "volume_space" {
locked = false locked = false
require_full_window = true require_full_window = true
tags = ["env:${var.environment}", "type:caas", "provider:kubernetes", "resource:kubernetes-node", "team:claranet", "created-by:terraform", "${var.volume_space_extra_tags}"] tags = concat(["env:${var.environment}", "type:caas", "provider:kubernetes", "resource:kubernetes-node", "team:claranet", "created-by:terraform"], var.volume_space_extra_tags)
} }
resource "datadog_monitor" "volume_inodes" { resource "datadog_monitor" "volume_inodes" {
count = "${var.volume_inodes_enabled == "true" ? 1 : 0}" count = var.volume_inodes_enabled == "true" ? 1 : 0
name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Kubernetes Node volume inodes usage {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Kubernetes Node volume inodes usage {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
message = coalesce(var.volume_inodes_message, var.message)
type = "query alert" type = "query alert"
message = "${coalesce(var.volume_inodes_message, var.message)}"
query = <<EOQ query = <<EOQ
${var.volume_inodes_time_aggregator}(${var.volume_inodes_timeframe}): ${var.volume_inodes_time_aggregator}(${var.volume_inodes_timeframe}):
avg:kubernetes.kubelet.volume.stats.inodes_used${module.filter-tags.query_alert} by {kubernetescluster,name,persistentvolumeclaim} / avg:kubernetes.kubelet.volume.stats.inodes_used${module.filter-tags.query_alert} by {kubernetescluster,name,persistentvolumeclaim} /
avg:kubernetes.kubelet.volume.stats.inodes${module.filter-tags.query_alert} by {kubernetescluster,name,persistentvolumeclaim} avg:kubernetes.kubelet.volume.stats.inodes${module.filter-tags.query_alert} by {kubernetescluster,name,persistentvolumeclaim}
* 100 > ${var.volume_inodes_threshold_critical} * 100 > ${var.volume_inodes_threshold_critical}
EOQ EOQ
thresholds { thresholds = {
critical = "${var.volume_inodes_threshold_critical}" critical = var.volume_inodes_threshold_critical
warning = "${var.volume_inodes_threshold_warning}" warning = var.volume_inodes_threshold_warning
} }
evaluation_delay = "${var.evaluation_delay}" evaluation_delay = var.evaluation_delay
new_host_delay = "${var.new_host_delay}" new_host_delay = var.new_host_delay
notify_no_data = false notify_no_data = false
renotify_interval = 0 renotify_interval = 0
notify_audit = false notify_audit = false
@ -279,5 +269,6 @@ resource "datadog_monitor" "volume_inodes" {
locked = false locked = false
require_full_window = true require_full_window = true
tags = ["env:${var.environment}", "type:caas", "provider:kubernetes", "resource:kubernetes-node", "team:claranet", "created-by:terraform", "${var.volume_inodes_extra_tags}"] tags = concat(["env:${var.environment}", "type:caas", "provider:kubernetes", "resource:kubernetes-node", "team:claranet", "created-by:terraform"], var.volume_inodes_extra_tags)
} }

View File

@ -1,49 +1,50 @@
output "disk_pressure_id" { output "disk_pressure_id" {
description = "id for monitor disk_pressure" description = "id for monitor disk_pressure"
value = "${datadog_monitor.disk_pressure.*.id}" value = datadog_monitor.disk_pressure.*.id
} }
output "disk_out_id" { output "disk_out_id" {
description = "id for monitor disk_out" description = "id for monitor disk_out"
value = "${datadog_monitor.disk_out.*.id}" value = datadog_monitor.disk_out.*.id
} }
output "memory_pressure_id" { output "memory_pressure_id" {
description = "id for monitor memory_pressure" description = "id for monitor memory_pressure"
value = "${datadog_monitor.memory_pressure.*.id}" value = datadog_monitor.memory_pressure.*.id
} }
output "ready_id" { output "ready_id" {
description = "id for monitor ready" description = "id for monitor ready"
value = "${datadog_monitor.ready.*.id}" value = datadog_monitor.ready.*.id
} }
output "kubelet_ping_id" { output "kubelet_ping_id" {
description = "id for monitor kubelet_ping" description = "id for monitor kubelet_ping"
value = "${datadog_monitor.kubelet_ping.*.id}" value = datadog_monitor.kubelet_ping.*.id
} }
output "kubelet_syncloop_id" { output "kubelet_syncloop_id" {
description = "id for monitor kubelet_syncloop" description = "id for monitor kubelet_syncloop"
value = "${datadog_monitor.kubelet_syncloop.*.id}" value = datadog_monitor.kubelet_syncloop.*.id
} }
output "unregister_net_device_id" { output "unregister_net_device_id" {
description = "id for monitor unregister_net_device" description = "id for monitor unregister_net_device"
value = "${datadog_monitor.unregister_net_device.*.id}" value = datadog_monitor.unregister_net_device.*.id
} }
output "node_unschedulable_id" { output "node_unschedulable_id" {
description = "id for monitor node_unschedulable" description = "id for monitor node_unschedulable"
value = "${datadog_monitor.node_unschedulable.*.id}" value = datadog_monitor.node_unschedulable.*.id
} }
output "volume_space_id" { output "volume_space_id" {
description = "id for monitor volume_space" description = "id for monitor volume_space"
value = "${datadog_monitor.volume_space.*.id}" value = datadog_monitor.volume_space.*.id
} }
output "volume_inodes_id" { output "volume_inodes_id" {
description = "id for monitor volume_inodes" description = "id for monitor volume_inodes"
value = "${datadog_monitor.volume_inodes.*.id}" value = datadog_monitor.volume_inodes.*.id
} }

View File

@ -6,8 +6,8 @@
module "datadog-monitors-caas-kubernetes-pod" { module "datadog-monitors-caas-kubernetes-pod" {
source = "git::ssh://git@git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors.git//caas/kubernetes/pod?ref={revision}" source = "git::ssh://git@git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors.git//caas/kubernetes/pod?ref={revision}"
environment = "${var.environment}" environment = var.environment
message = "${module.datadog-message-alerting.alerting-message}" message = module.datadog-message-alerting.alerting-message
} }
``` ```
@ -25,7 +25,7 @@ Creates DataDog monitors with the following checks:
|------|-------------|:----:|:-----:|:-----:| |------|-------------|:----:|:-----:|:-----:|
| environment | Architecture environment | string | n/a | yes | | environment | Architecture environment | string | n/a | yes |
| error\_enabled | Flag to enable Pod errors monitor | string | `"true"` | no | | error\_enabled | Flag to enable Pod errors monitor | string | `"true"` | no |
| error\_extra\_tags | Extra tags for Pod errors monitor | list | `[]` | no | | error\_extra\_tags | Extra tags for Pod errors monitor | list(string) | `[]` | no |
| error\_message | Custom message for Pod errors monitor | string | `""` | no | | error\_message | Custom message for Pod errors monitor | string | `""` | no |
| error\_threshold\_critical | error critical threshold | string | `"0.5"` | no | | error\_threshold\_critical | error critical threshold | string | `"0.5"` | no |
| error\_threshold\_warning | error warning threshold | string | `"0"` | no | | error\_threshold\_warning | error warning threshold | string | `"0"` | no |
@ -38,7 +38,7 @@ Creates DataDog monitors with the following checks:
| message | Message sent when a monitor is triggered | string | n/a | yes | | message | Message sent when a monitor is triggered | string | n/a | yes |
| new\_host\_delay | Delay in seconds before monitor new resource | string | `"300"` | no | | new\_host\_delay | Delay in seconds before monitor new resource | string | `"300"` | no |
| pod\_phase\_status\_enabled | Flag to enable Pod phase status monitor | string | `"true"` | no | | pod\_phase\_status\_enabled | Flag to enable Pod phase status monitor | string | `"true"` | no |
| pod\_phase\_status\_extra\_tags | Extra tags for Pod phase status monitor | list | `[]` | no | | pod\_phase\_status\_extra\_tags | Extra tags for Pod phase status monitor | list(string) | `[]` | no |
| pod\_phase\_status\_message | Custom message for Pod phase status monitor | string | `""` | no | | pod\_phase\_status\_message | Custom message for Pod phase status monitor | string | `""` | no |
| pod\_phase\_status\_time\_aggregator | Monitor aggregator for Pod phase status [available values: min, max or avg] | string | `"max"` | no | | pod\_phase\_status\_time\_aggregator | Monitor aggregator for Pod phase status [available values: min, max or avg] | string | `"max"` | no |
| pod\_phase\_status\_timeframe | Monitor timeframe for Pod phase status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no | | pod\_phase\_status\_timeframe | Monitor timeframe for Pod phase status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no |

View File

@ -42,61 +42,61 @@ variable "prefix_slug" {
variable "pod_phase_status_enabled" { variable "pod_phase_status_enabled" {
description = "Flag to enable Pod phase status monitor" description = "Flag to enable Pod phase status monitor"
type = "string" type = string
default = "true" default = "true"
} }
variable "pod_phase_status_extra_tags" { variable "pod_phase_status_extra_tags" {
description = "Extra tags for Pod phase status monitor" description = "Extra tags for Pod phase status monitor"
type = "list" type = list(string)
default = [] default = []
} }
variable "pod_phase_status_message" { variable "pod_phase_status_message" {
description = "Custom message for Pod phase status monitor" description = "Custom message for Pod phase status monitor"
type = "string" type = string
default = "" default = ""
} }
variable "pod_phase_status_time_aggregator" { variable "pod_phase_status_time_aggregator" {
description = "Monitor aggregator for Pod phase status [available values: min, max or avg]" description = "Monitor aggregator for Pod phase status [available values: min, max or avg]"
type = "string" type = string
default = "max" default = "max"
} }
variable "pod_phase_status_timeframe" { variable "pod_phase_status_timeframe" {
description = "Monitor timeframe for Pod phase status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" description = "Monitor timeframe for Pod phase status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string" type = string
default = "last_5m" default = "last_5m"
} }
variable "error_enabled" { variable "error_enabled" {
description = "Flag to enable Pod errors monitor" description = "Flag to enable Pod errors monitor"
type = "string" type = string
default = "true" default = "true"
} }
variable "error_extra_tags" { variable "error_extra_tags" {
description = "Extra tags for Pod errors monitor" description = "Extra tags for Pod errors monitor"
type = "list" type = list(string)
default = [] default = []
} }
variable "error_message" { variable "error_message" {
description = "Custom message for Pod errors monitor" description = "Custom message for Pod errors monitor"
type = "string" type = string
default = "" default = ""
} }
variable "error_time_aggregator" { variable "error_time_aggregator" {
description = "Monitor aggregator for Pod errors [available values: min, max or avg]" description = "Monitor aggregator for Pod errors [available values: min, max or avg]"
type = "string" type = string
default = "sum" default = "sum"
} }
variable "error_timeframe" { variable "error_timeframe" {
description = "Monitor timeframe for Pod errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" description = "Monitor timeframe for Pod errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string" type = string
default = "last_15m" default = "last_15m"
} }
@ -109,3 +109,4 @@ variable "error_threshold_warning" {
default = 0 default = 0
description = "error warning threshold" description = "error warning threshold"
} }

View File

@ -1,31 +1,32 @@
module "filter-tags" { module "filter-tags" {
source = "../../../common/filter-tags" source = "../../../common/filter-tags"
environment = "${var.environment}" environment = var.environment
resource = "kubernetes" resource = "kubernetes"
filter_tags_use_defaults = "${var.filter_tags_use_defaults}" filter_tags_use_defaults = var.filter_tags_use_defaults
filter_tags_custom = "${var.filter_tags_custom}" filter_tags_custom = var.filter_tags_custom
filter_tags_custom_excluded = "${var.filter_tags_custom_excluded}" filter_tags_custom_excluded = var.filter_tags_custom_excluded
} }
module "filter-tags-phase" { module "filter-tags-phase" {
source = "../../../common/filter-tags" source = "../../../common/filter-tags"
environment = "${var.environment}" environment = var.environment
resource = "kubernetes" resource = "kubernetes"
filter_tags_use_defaults = "${var.filter_tags_use_defaults}" filter_tags_use_defaults = var.filter_tags_use_defaults
filter_tags_custom = "${var.filter_tags_custom}" filter_tags_custom = var.filter_tags_custom
filter_tags_custom_excluded = "${var.filter_tags_custom_excluded}" filter_tags_custom_excluded = var.filter_tags_custom_excluded
extra_tags_excluded = ["phase:pending,phase:running,phase:succeeded,phase:unknown"] extra_tags_excluded = ["phase:pending,phase:running,phase:succeeded,phase:unknown"]
} }
module "filter-tags-nocontainercreating" { module "filter-tags-nocontainercreating" {
source = "../../../common/filter-tags" source = "../../../common/filter-tags"
environment = "${var.environment}" environment = var.environment
resource = "kubernetes" resource = "kubernetes"
filter_tags_use_defaults = "${var.filter_tags_use_defaults}" filter_tags_use_defaults = var.filter_tags_use_defaults
filter_tags_custom = "${var.filter_tags_custom}" filter_tags_custom = var.filter_tags_custom
filter_tags_custom_excluded = "${var.filter_tags_custom_excluded}" filter_tags_custom_excluded = var.filter_tags_custom_excluded
extra_tags_excluded = ["reason:containercreating"] extra_tags_excluded = ["reason:containercreating"]
} }

View File

@ -1,20 +1,20 @@
resource "datadog_monitor" "pod_phase_status" { resource "datadog_monitor" "pod_phase_status" {
count = "${var.pod_phase_status_enabled == "true" ? 1 : 0}" count = var.pod_phase_status_enabled == "true" ? 1 : 0
name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Kubernetes Pod phase status failed" name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Kubernetes Pod phase status failed"
message = coalesce(var.pod_phase_status_message, var.message)
type = "metric alert" type = "metric alert"
message = "${coalesce(var.pod_phase_status_message, var.message)}"
query = <<EOQ query = <<EOQ
${var.pod_phase_status_time_aggregator}(${var.pod_phase_status_timeframe}): ${var.pod_phase_status_time_aggregator}(${var.pod_phase_status_timeframe}):
sum:kubernetes_state.pod.status_phase${module.filter-tags-phase.query_alert} by {namespace} > 0 sum:kubernetes_state.pod.status_phase${module.filter-tags-phase.query_alert} by {namespace} > 0
EOQ EOQ
thresholds { thresholds = {
critical = 0 critical = 0
} }
evaluation_delay = "${var.evaluation_delay}" evaluation_delay = var.evaluation_delay
new_host_delay = "${var.new_host_delay}" new_host_delay = var.new_host_delay
notify_no_data = false notify_no_data = false
renotify_interval = 0 renotify_interval = 0
@ -24,29 +24,28 @@ resource "datadog_monitor" "pod_phase_status" {
locked = false locked = false
require_full_window = true require_full_window = true
tags = ["env:${var.environment}", "type:caas", "provider:kubernetes", "resource:kubernetes-pod", "team:claranet", "created-by:terraform", "${var.pod_phase_status_extra_tags}"] tags = concat(["env:${var.environment}", "type:caas", "provider:kubernetes", "resource:kubernetes-pod", "team:claranet", "created-by:terraform"], var.pod_phase_status_extra_tags)
} }
resource "datadog_monitor" "error" { resource "datadog_monitor" "error" {
count = "${var.error_enabled == "true" ? 1 : 0}" count = var.error_enabled == "true" ? 1 : 0
name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Kubernetes Pod waiting errors" name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Kubernetes Pod waiting errors"
message = coalesce(var.error_message, var.message)
type = "query alert" type = "query alert"
message = "${coalesce(var.error_message, var.message)}"
query = <<EOQ query = <<EOQ
${var.error_time_aggregator}(${var.error_timeframe}): ${var.error_time_aggregator}(${var.error_timeframe}):
sum:kubernetes_state.container.status_report.count.waiting${module.filter-tags-nocontainercreating.query_alert} by {namespace,pod,reason}.as_count() sum:kubernetes_state.container.status_report.count.waiting${module.filter-tags-nocontainercreating.query_alert} by {namespace,pod,reason}.as_count()
> ${var.error_threshold_critical} > ${var.error_threshold_critical}
EOQ EOQ
thresholds { thresholds = {
critical = "${var.error_threshold_critical}" critical = var.error_threshold_critical
warning = "${var.error_threshold_warning}" warning = var.error_threshold_warning
} }
evaluation_delay = "${var.evaluation_delay}" evaluation_delay = var.evaluation_delay
new_host_delay = "${var.new_host_delay}" new_host_delay = var.new_host_delay
notify_no_data = false notify_no_data = false
renotify_interval = 0 renotify_interval = 0
notify_audit = false notify_audit = false
@ -55,5 +54,6 @@ resource "datadog_monitor" "error" {
locked = false locked = false
require_full_window = true require_full_window = true
tags = ["env:${var.environment}", "type:caas", "provider:kubernetes", "resource:kubernetes-pod", "team:claranet", "created-by:terraform", "${var.error_extra_tags}"] tags = concat(["env:${var.environment}", "type:caas", "provider:kubernetes", "resource:kubernetes-pod", "team:claranet", "created-by:terraform"], var.error_extra_tags)
} }

View File

@ -1,9 +1,10 @@
output "pod_phase_status_id" { output "pod_phase_status_id" {
description = "id for monitor pod_phase_status" description = "id for monitor pod_phase_status"
value = "${datadog_monitor.pod_phase_status.*.id}" value = datadog_monitor.pod_phase_status.*.id
} }
output "error_id" { output "error_id" {
description = "id for monitor error" description = "id for monitor error"
value = "${datadog_monitor.error.*.id}" value = datadog_monitor.error.*.id
} }

View File

@ -6,8 +6,8 @@
module "datadog-monitors-caas-kubernetes-workload" { module "datadog-monitors-caas-kubernetes-workload" {
source = "git::ssh://git@git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors.git//caas/kubernetes/workload?ref={revision}" source = "git::ssh://git@git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors.git//caas/kubernetes/workload?ref={revision}"
environment = "${var.environment}" environment = var.environment
message = "${module.datadog-message-alerting.alerting-message}" message = module.datadog-message-alerting.alerting-message
} }
``` ```
@ -27,7 +27,7 @@ Creates DataDog monitors with the following checks:
| Name | Description | Type | Default | Required | | Name | Description | Type | Default | Required |
|------|-------------|:----:|:-----:|:-----:| |------|-------------|:----:|:-----:|:-----:|
| cronjob\_enabled | Flag to enable Cronjob monitor | string | `"true"` | no | | cronjob\_enabled | Flag to enable Cronjob monitor | string | `"true"` | no |
| cronjob\_extra\_tags | Extra tags for Cronjob monitor | list | `[]` | no | | cronjob\_extra\_tags | Extra tags for Cronjob monitor | list(string) | `[]` | no |
| cronjob\_message | Custom message for Cronjob monitor | string | `""` | no | | cronjob\_message | Custom message for Cronjob monitor | string | `""` | no |
| cronjob\_threshold\_warning | Cronjob monitor (warning threshold) | string | `"3"` | no | | cronjob\_threshold\_warning | Cronjob monitor (warning threshold) | string | `"3"` | no |
| environment | Architecture environment | string | n/a | yes | | environment | Architecture environment | string | n/a | yes |
@ -36,26 +36,26 @@ Creates DataDog monitors with the following checks:
| filter\_tags\_custom\_excluded | Tags excluded for custom filtering when filter_tags_use_defaults is false | string | `""` | no | | filter\_tags\_custom\_excluded | Tags excluded for custom filtering when filter_tags_use_defaults is false | string | `""` | no |
| filter\_tags\_use\_defaults | Use default filter tags convention | string | `"true"` | no | | filter\_tags\_use\_defaults | Use default filter tags convention | string | `"true"` | no |
| job\_enabled | Flag to enable Job monitor | string | `"true"` | no | | job\_enabled | Flag to enable Job monitor | string | `"true"` | no |
| job\_extra\_tags | Extra tags for Job monitor | list | `[]` | no | | job\_extra\_tags | Extra tags for Job monitor | list(string) | `[]` | no |
| job\_message | Custom message for Job monitor | string | `""` | no | | job\_message | Custom message for Job monitor | string | `""` | no |
| job\_threshold\_warning | Job monitor (warning threshold) | string | `"3"` | no | | job\_threshold\_warning | Job monitor (warning threshold) | string | `"3"` | no |
| message | Message sent when a monitor is triggered | string | n/a | yes | | message | Message sent when a monitor is triggered | string | n/a | yes |
| new\_host\_delay | Delay in seconds before monitor new resource | string | `"300"` | no | | new\_host\_delay | Delay in seconds before monitor new resource | string | `"300"` | no |
| prefix\_slug | Prefix string to prepend between brackets on every monitors names | string | `""` | no | | prefix\_slug | Prefix string to prepend between brackets on every monitors names | string | `""` | no |
| replica\_available\_enabled | Flag to enable Available replica monitor | string | `"true"` | no | | replica\_available\_enabled | Flag to enable Available replica monitor | string | `"true"` | no |
| replica\_available\_extra\_tags | Extra tags for Available replica monitor | list | `[]` | no | | replica\_available\_extra\_tags | Extra tags for Available replica monitor | list(string) | `[]` | no |
| replica\_available\_message | Custom message for Available replica monitor | string | `""` | no | | replica\_available\_message | Custom message for Available replica monitor | string | `""` | no |
| replica\_available\_threshold\_critical | Available replica critical threshold | string | `"1"` | no | | replica\_available\_threshold\_critical | Available replica critical threshold | string | `"1"` | no |
| replica\_available\_time\_aggregator | Monitor aggregator for Available replica [available values: min, max or avg] | string | `"max"` | no | | replica\_available\_time\_aggregator | Monitor aggregator for Available replica [available values: min, max or avg] | string | `"max"` | no |
| replica\_available\_timeframe | Monitor timeframe for Available replica [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_15m"` | no | | replica\_available\_timeframe | Monitor timeframe for Available replica [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_15m"` | no |
| replica\_current\_enabled | Flag to enable Current replica monitor | string | `"true"` | no | | replica\_current\_enabled | Flag to enable Current replica monitor | string | `"true"` | no |
| replica\_current\_extra\_tags | Extra tags for Current replica monitor | list | `[]` | no | | replica\_current\_extra\_tags | Extra tags for Current replica monitor | list(string) | `[]` | no |
| replica\_current\_message | Custom message for Current replica monitor | string | `""` | no | | replica\_current\_message | Custom message for Current replica monitor | string | `""` | no |
| replica\_current\_threshold\_critical | Current replica critical threshold | string | `"1"` | no | | replica\_current\_threshold\_critical | Current replica critical threshold | string | `"1"` | no |
| replica\_current\_time\_aggregator | Monitor aggregator for Current replica [available values: min, max or avg] | string | `"max"` | no | | replica\_current\_time\_aggregator | Monitor aggregator for Current replica [available values: min, max or avg] | string | `"max"` | no |
| replica\_current\_timeframe | Monitor timeframe for Current replica [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_15m"` | no | | replica\_current\_timeframe | Monitor timeframe for Current replica [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_15m"` | no |
| replica\_ready\_enabled | Flag to enable Ready replica monitor | string | `"true"` | no | | replica\_ready\_enabled | Flag to enable Ready replica monitor | string | `"true"` | no |
| replica\_ready\_extra\_tags | Extra tags for Ready replica monitor | list | `[]` | no | | replica\_ready\_extra\_tags | Extra tags for Ready replica monitor | list(string) | `[]` | no |
| replica\_ready\_message | Custom message for Ready replica monitor | string | `""` | no | | replica\_ready\_message | Custom message for Ready replica monitor | string | `""` | no |
| replica\_ready\_threshold\_critical | Ready replica critical threshold | string | `"1"` | no | | replica\_ready\_threshold\_critical | Ready replica critical threshold | string | `"1"` | no |
| replica\_ready\_time\_aggregator | Monitor aggregator for Ready replica [available values: min, max or avg] | string | `"max"` | no | | replica\_ready\_time\_aggregator | Monitor aggregator for Ready replica [available values: min, max or avg] | string | `"max"` | no |

View File

@ -42,79 +42,79 @@ variable "prefix_slug" {
variable "job_enabled" { variable "job_enabled" {
description = "Flag to enable Job monitor" description = "Flag to enable Job monitor"
type = "string" type = string
default = "true" default = "true"
} }
variable "job_extra_tags" { variable "job_extra_tags" {
description = "Extra tags for Job monitor" description = "Extra tags for Job monitor"
type = "list" type = list(string)
default = [] default = []
} }
variable "job_message" { variable "job_message" {
description = "Custom message for Job monitor" description = "Custom message for Job monitor"
type = "string" type = string
default = "" default = ""
} }
variable "job_threshold_warning" { variable "job_threshold_warning" {
description = "Job monitor (warning threshold)" description = "Job monitor (warning threshold)"
type = "string" type = string
default = 3 default = 3
} }
variable "cronjob_enabled" { variable "cronjob_enabled" {
description = "Flag to enable Cronjob monitor" description = "Flag to enable Cronjob monitor"
type = "string" type = string
default = "true" default = "true"
} }
variable "cronjob_extra_tags" { variable "cronjob_extra_tags" {
description = "Extra tags for Cronjob monitor" description = "Extra tags for Cronjob monitor"
type = "list" type = list(string)
default = [] default = []
} }
variable "cronjob_message" { variable "cronjob_message" {
description = "Custom message for Cronjob monitor" description = "Custom message for Cronjob monitor"
type = "string" type = string
default = "" default = ""
} }
variable "cronjob_threshold_warning" { variable "cronjob_threshold_warning" {
description = "Cronjob monitor (warning threshold)" description = "Cronjob monitor (warning threshold)"
type = "string" type = string
default = 3 default = 3
} }
variable "replica_available_enabled" { variable "replica_available_enabled" {
description = "Flag to enable Available replica monitor" description = "Flag to enable Available replica monitor"
type = "string" type = string
default = "true" default = "true"
} }
variable "replica_available_extra_tags" { variable "replica_available_extra_tags" {
description = "Extra tags for Available replica monitor" description = "Extra tags for Available replica monitor"
type = "list" type = list(string)
default = [] default = []
} }
variable "replica_available_message" { variable "replica_available_message" {
description = "Custom message for Available replica monitor" description = "Custom message for Available replica monitor"
type = "string" type = string
default = "" default = ""
} }
variable "replica_available_time_aggregator" { variable "replica_available_time_aggregator" {
description = "Monitor aggregator for Available replica [available values: min, max or avg]" description = "Monitor aggregator for Available replica [available values: min, max or avg]"
type = "string" type = string
default = "max" default = "max"
} }
variable "replica_available_timeframe" { variable "replica_available_timeframe" {
description = "Monitor timeframe for Available replica [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" description = "Monitor timeframe for Available replica [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string" type = string
default = "last_15m" default = "last_15m"
} }
@ -125,31 +125,31 @@ variable "replica_available_threshold_critical" {
variable "replica_ready_enabled" { variable "replica_ready_enabled" {
description = "Flag to enable Ready replica monitor" description = "Flag to enable Ready replica monitor"
type = "string" type = string
default = "true" default = "true"
} }
variable "replica_ready_extra_tags" { variable "replica_ready_extra_tags" {
description = "Extra tags for Ready replica monitor" description = "Extra tags for Ready replica monitor"
type = "list" type = list(string)
default = [] default = []
} }
variable "replica_ready_message" { variable "replica_ready_message" {
description = "Custom message for Ready replica monitor" description = "Custom message for Ready replica monitor"
type = "string" type = string
default = "" default = ""
} }
variable "replica_ready_time_aggregator" { variable "replica_ready_time_aggregator" {
description = "Monitor aggregator for Ready replica [available values: min, max or avg]" description = "Monitor aggregator for Ready replica [available values: min, max or avg]"
type = "string" type = string
default = "max" default = "max"
} }
variable "replica_ready_timeframe" { variable "replica_ready_timeframe" {
description = "Monitor timeframe for Ready replica [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" description = "Monitor timeframe for Ready replica [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string" type = string
default = "last_5m" default = "last_5m"
} }
@ -160,31 +160,31 @@ variable "replica_ready_threshold_critical" {
variable "replica_current_enabled" { variable "replica_current_enabled" {
description = "Flag to enable Current replica monitor" description = "Flag to enable Current replica monitor"
type = "string" type = string
default = "true" default = "true"
} }
variable "replica_current_extra_tags" { variable "replica_current_extra_tags" {
description = "Extra tags for Current replica monitor" description = "Extra tags for Current replica monitor"
type = "list" type = list(string)
default = [] default = []
} }
variable "replica_current_message" { variable "replica_current_message" {
description = "Custom message for Current replica monitor" description = "Custom message for Current replica monitor"
type = "string" type = string
default = "" default = ""
} }
variable "replica_current_time_aggregator" { variable "replica_current_time_aggregator" {
description = "Monitor aggregator for Current replica [available values: min, max or avg]" description = "Monitor aggregator for Current replica [available values: min, max or avg]"
type = "string" type = string
default = "max" default = "max"
} }
variable "replica_current_timeframe" { variable "replica_current_timeframe" {
description = "Monitor timeframe for Current replica [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" description = "Monitor timeframe for Current replica [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string" type = string
default = "last_15m" default = "last_15m"
} }
@ -192,3 +192,4 @@ variable "replica_current_threshold_critical" {
default = 1 default = 1
description = "Current replica critical threshold" description = "Current replica critical threshold"
} }

View File

@ -1,9 +1,10 @@
module "filter-tags" { module "filter-tags" {
source = "../../../common/filter-tags" source = "../../../common/filter-tags"
environment = "${var.environment}" environment = var.environment
resource = "kubernetes" resource = "kubernetes"
filter_tags_use_defaults = "${var.filter_tags_use_defaults}" filter_tags_use_defaults = var.filter_tags_use_defaults
filter_tags_custom = "${var.filter_tags_custom}" filter_tags_custom = var.filter_tags_custom
filter_tags_custom_excluded = "${var.filter_tags_custom_excluded}" filter_tags_custom_excluded = var.filter_tags_custom_excluded
} }

View File

@ -1,20 +1,19 @@
resource "datadog_monitor" "job" { resource "datadog_monitor" "job" {
count = "${var.job_enabled == "true" ? 1 : 0}" count = var.job_enabled == "true" ? 1 : 0
name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Kubernetes job failed" name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Kubernetes job failed"
message = "${coalesce(var.job_message, var.message)}" message = coalesce(var.job_message, var.message)
type = "service check"
type = "service check"
query = <<EOQ query = <<EOQ
"kubernetes_state.job.complete"${module.filter-tags.service_check}.by("job_name").last(6).count_by_status() "kubernetes_state.job.complete"${module.filter-tags.service_check}.by("job_name").last(6).count_by_status()
EOQ EOQ
thresholds = { thresholds = {
warning = "${var.job_threshold_warning}" warning = var.job_threshold_warning
critical = 5 critical = 5
} }
new_host_delay = "${var.new_host_delay}" new_host_delay = var.new_host_delay
notify_no_data = false notify_no_data = false
renotify_interval = 0 renotify_interval = 0
notify_audit = false notify_audit = false
@ -23,26 +22,25 @@ resource "datadog_monitor" "job" {
locked = false locked = false
require_full_window = true require_full_window = true
tags = ["env:${var.environment}", "type:caas", "provider:kubernetes", "resource:kubernetes-workload", "team:claranet", "created-by:terraform", "${var.job_extra_tags}"] tags = concat(["env:${var.environment}", "type:caas", "provider:kubernetes", "resource:kubernetes-workload", "team:claranet", "created-by:terraform"], var.job_extra_tags)
} }
resource "datadog_monitor" "cronjob" { resource "datadog_monitor" "cronjob" {
count = "${var.cronjob_enabled == "true" ? 1 : 0}" count = var.cronjob_enabled == "true" ? 1 : 0
name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Kubernetes cronjob scheduling failed" name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Kubernetes cronjob scheduling failed"
message = "${coalesce(var.cronjob_message, var.message)}" message = coalesce(var.cronjob_message, var.message)
type = "service check"
type = "service check"
query = <<EOQ query = <<EOQ
"kubernetes_state.cronjob.on_schedule_check"${module.filter-tags.service_check}.by("cronjob").last(6).count_by_status() "kubernetes_state.cronjob.on_schedule_check"${module.filter-tags.service_check}.by("cronjob").last(6).count_by_status()
EOQ EOQ
thresholds = { thresholds = {
warning = "${var.cronjob_threshold_warning}" warning = var.cronjob_threshold_warning
critical = 5 critical = 5
} }
new_host_delay = "${var.new_host_delay}" new_host_delay = var.new_host_delay
notify_no_data = false notify_no_data = false
renotify_interval = 0 renotify_interval = 0
notify_audit = false notify_audit = false
@ -51,29 +49,28 @@ resource "datadog_monitor" "cronjob" {
locked = false locked = false
require_full_window = true require_full_window = true
tags = ["env:${var.environment}", "type:caas", "provider:kubernetes", "resource:kubernetes-workload", "team:claranet", "created-by:terraform", "${var.cronjob_extra_tags}"] tags = concat(["env:${var.environment}", "type:caas", "provider:kubernetes", "resource:kubernetes-workload", "team:claranet", "created-by:terraform"], var.cronjob_extra_tags)
} }
resource "datadog_monitor" "replica_available" { resource "datadog_monitor" "replica_available" {
count = "${var.replica_available_enabled == "true" ? 1 : 0}" count = var.replica_available_enabled == "true" ? 1 : 0
name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Kubernetes Available replicas {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Kubernetes Available replicas {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
message = coalesce(var.replica_available_message, var.message)
type = "query alert" type = "query alert"
message = "${coalesce(var.replica_available_message, var.message)}"
query = <<EOQ query = <<EOQ
${var.replica_available_time_aggregator}(${var.replica_available_timeframe}): ${var.replica_available_time_aggregator}(${var.replica_available_timeframe}):
max:kubernetes_state.deployment.replicas_desired${module.filter-tags.query_alert} by {namespace, deployment} - max:kubernetes_state.deployment.replicas_desired${module.filter-tags.query_alert} by {namespace, deployment} -
max:kubernetes_state.deployment.replicas_available${module.filter-tags.query_alert} by {namespace, deployment} max:kubernetes_state.deployment.replicas_available${module.filter-tags.query_alert} by {namespace, deployment}
+ 1 < ${var.replica_available_threshold_critical} + 1 < ${var.replica_available_threshold_critical}
EOQ EOQ
thresholds { thresholds = {
critical = "${var.replica_available_threshold_critical}" critical = var.replica_available_threshold_critical
} }
evaluation_delay = "${var.evaluation_delay}" evaluation_delay = var.evaluation_delay
new_host_delay = "${var.new_host_delay}" new_host_delay = var.new_host_delay
notify_no_data = false notify_no_data = false
renotify_interval = 0 renotify_interval = 0
notify_audit = false notify_audit = false
@ -82,29 +79,28 @@ resource "datadog_monitor" "replica_available" {
locked = false locked = false
require_full_window = true require_full_window = true
tags = ["env:${var.environment}", "type:caas", "provider:kubernetes", "resource:kubernetes-workload", "team:claranet", "created-by:terraform", "${var.replica_available_extra_tags}"] tags = concat(["env:${var.environment}", "type:caas", "provider:kubernetes", "resource:kubernetes-workload", "team:claranet", "created-by:terraform"], var.replica_available_extra_tags)
} }
resource "datadog_monitor" "replica_ready" { resource "datadog_monitor" "replica_ready" {
count = "${var.replica_ready_enabled == "true" ? 1 : 0}" count = var.replica_ready_enabled == "true" ? 1 : 0
name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Kubernetes Ready replicas {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Kubernetes Ready replicas {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
message = coalesce(var.replica_ready_message, var.message)
type = "query alert" type = "query alert"
message = "${coalesce(var.replica_ready_message, var.message)}"
query = <<EOQ query = <<EOQ
${var.replica_ready_time_aggregator}(${var.replica_ready_timeframe}): ${var.replica_ready_time_aggregator}(${var.replica_ready_timeframe}):
max:kubernetes_state.replicaset.replicas_desired${module.filter-tags.query_alert} by {namespace, replicaset} - max:kubernetes_state.replicaset.replicas_desired${module.filter-tags.query_alert} by {namespace, replicaset} -
max:kubernetes_state.replicaset.replicas_ready${module.filter-tags.query_alert} by {namespace, replicaset} max:kubernetes_state.replicaset.replicas_ready${module.filter-tags.query_alert} by {namespace, replicaset}
+ 1 < ${var.replica_ready_threshold_critical} + 1 < ${var.replica_ready_threshold_critical}
EOQ EOQ
thresholds { thresholds = {
critical = "${var.replica_ready_threshold_critical}" critical = var.replica_ready_threshold_critical
} }
evaluation_delay = "${var.evaluation_delay}" evaluation_delay = var.evaluation_delay
new_host_delay = "${var.new_host_delay}" new_host_delay = var.new_host_delay
notify_no_data = false notify_no_data = false
renotify_interval = 0 renotify_interval = 0
notify_audit = false notify_audit = false
@ -113,29 +109,28 @@ resource "datadog_monitor" "replica_ready" {
locked = false locked = false
require_full_window = true require_full_window = true
tags = ["env:${var.environment}", "type:caas", "provider:kubernetes", "resource:kubernetes-workload", "team:claranet", "created-by:terraform", "${var.replica_ready_extra_tags}"] tags = concat(["env:${var.environment}", "type:caas", "provider:kubernetes", "resource:kubernetes-workload", "team:claranet", "created-by:terraform"], var.replica_ready_extra_tags)
} }
resource "datadog_monitor" "replica_current" { resource "datadog_monitor" "replica_current" {
count = "${var.replica_current_enabled == "true" ? 1 : 0}" count = var.replica_current_enabled == "true" ? 1 : 0
name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Kubernetes Current replicas {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Kubernetes Current replicas {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
message = coalesce(var.replica_current_message, var.message)
type = "query alert" type = "query alert"
message = "${coalesce(var.replica_current_message, var.message)}"
query = <<EOQ query = <<EOQ
${var.replica_current_time_aggregator}(${var.replica_current_timeframe}): ${var.replica_current_time_aggregator}(${var.replica_current_timeframe}):
max:kubernetes_state.replicaset.replicas_desired${module.filter-tags.query_alert} by {namespace, replicaset} - max:kubernetes_state.replicaset.replicas_desired${module.filter-tags.query_alert} by {namespace, replicaset} -
max:kubernetes_state.replicaset.replicas${module.filter-tags.query_alert} by {namespace, replicaset} max:kubernetes_state.replicaset.replicas${module.filter-tags.query_alert} by {namespace, replicaset}
+ 1 < ${var.replica_current_threshold_critical} + 1 < ${var.replica_current_threshold_critical}
EOQ EOQ
thresholds { thresholds = {
critical = "${var.replica_current_threshold_critical}" critical = var.replica_current_threshold_critical
} }
evaluation_delay = "${var.evaluation_delay}" evaluation_delay = var.evaluation_delay
new_host_delay = "${var.new_host_delay}" new_host_delay = var.new_host_delay
notify_no_data = false notify_no_data = false
renotify_interval = 0 renotify_interval = 0
notify_audit = false notify_audit = false
@ -144,5 +139,6 @@ resource "datadog_monitor" "replica_current" {
locked = false locked = false
require_full_window = true require_full_window = true
tags = ["env:${var.environment}", "type:caas", "provider:kubernetes", "resource:kubernetes-workload", "team:claranet", "created-by:terraform", "${var.replica_current_extra_tags}"] tags = concat(["env:${var.environment}", "type:caas", "provider:kubernetes", "resource:kubernetes-workload", "team:claranet", "created-by:terraform"], var.replica_current_extra_tags)
} }

View File

@ -1,24 +1,25 @@
output "job_id" { output "job_id" {
description = "id for monitor job" description = "id for monitor job"
value = "${datadog_monitor.job.*.id}" value = datadog_monitor.job.*.id
} }
output "cronjob_id" { output "cronjob_id" {
description = "id for monitor cronjob" description = "id for monitor cronjob"
value = "${datadog_monitor.cronjob.*.id}" value = datadog_monitor.cronjob.*.id
} }
output "replica_available_id" { output "replica_available_id" {
description = "id for monitor replica_available" description = "id for monitor replica_available"
value = "${datadog_monitor.replica_available.*.id}" value = datadog_monitor.replica_available.*.id
} }
output "replica_ready_id" { output "replica_ready_id" {
description = "id for monitor replica_ready" description = "id for monitor replica_ready"
value = "${datadog_monitor.replica_ready.*.id}" value = datadog_monitor.replica_ready.*.id
} }
output "replica_current_id" { output "replica_current_id" {
description = "id for monitor replica_current" description = "id for monitor replica_current"
value = "${datadog_monitor.replica_current.*.id}" value = datadog_monitor.replica_current.*.id
} }

View File

@ -6,8 +6,8 @@
module "datadog-monitors-cloud-aws-alb" { module "datadog-monitors-cloud-aws-alb" {
source = "git::ssh://git@git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors.git//cloud/aws/alb?ref={revision}" source = "git::ssh://git@git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors.git//cloud/aws/alb?ref={revision}"
environment = "${var.environment}" environment = var.environment
message = "${module.datadog-message-alerting.alerting-message}" message = module.datadog-message-alerting.alerting-message
} }
``` ```
@ -28,7 +28,7 @@ Creates DataDog monitors with the following checks:
| Name | Description | Type | Default | Required | | Name | Description | Type | Default | Required |
|------|-------------|:----:|:-----:|:-----:| |------|-------------|:----:|:-----:|:-----:|
| alb\_no\_healthy\_instances\_enabled | Flag to enable ALB no healthy instances monitor | string | `"true"` | no | | alb\_no\_healthy\_instances\_enabled | Flag to enable ALB no healthy instances monitor | string | `"true"` | no |
| alb\_no\_healthy\_instances\_extra\_tags | Extra tags for ALB no healthy instances monitor | list | `[]` | no | | alb\_no\_healthy\_instances\_extra\_tags | Extra tags for ALB no healthy instances monitor | list(string) | `[]` | no |
| alb\_no\_healthy\_instances\_message | Custom message for ALB no healthy instances monitor | string | `""` | no | | alb\_no\_healthy\_instances\_message | Custom message for ALB no healthy instances monitor | string | `""` | no |
| alb\_no\_healthy\_instances\_time\_aggregator | Monitor aggregator for ALB no healthy instances [available values: min, max or avg] | string | `"min"` | no | | alb\_no\_healthy\_instances\_time\_aggregator | Monitor aggregator for ALB no healthy instances [available values: min, max or avg] | string | `"min"` | no |
| alb\_no\_healthy\_instances\_timeframe | Monitor timeframe for ALB no healthy instances [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no | | alb\_no\_healthy\_instances\_timeframe | Monitor timeframe for ALB no healthy instances [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no |
@ -39,35 +39,35 @@ Creates DataDog monitors with the following checks:
| filter\_tags\_custom\_excluded | Tags excluded for custom filtering when filter_tags_use_defaults is false | string | `""` | no | | filter\_tags\_custom\_excluded | Tags excluded for custom filtering when filter_tags_use_defaults is false | string | `""` | no |
| filter\_tags\_use\_defaults | Use default filter tags convention | string | `"true"` | no | | filter\_tags\_use\_defaults | Use default filter tags convention | string | `"true"` | no |
| httpcode\_alb\_4xx\_enabled | Flag to enable ALB httpcode 4xx monitor | string | `"true"` | no | | httpcode\_alb\_4xx\_enabled | Flag to enable ALB httpcode 4xx monitor | string | `"true"` | no |
| httpcode\_alb\_4xx\_extra\_tags | Extra tags for ALB httpcode 4xx monitor | list | `[]` | no | | httpcode\_alb\_4xx\_extra\_tags | Extra tags for ALB httpcode 4xx monitor | list(string) | `[]` | no |
| httpcode\_alb\_4xx\_message | Custom message for ALB httpcode 4xx monitor | string | `""` | no | | httpcode\_alb\_4xx\_message | Custom message for ALB httpcode 4xx monitor | string | `""` | no |
| httpcode\_alb\_4xx\_threshold\_critical | loadbalancer 4xx critical threshold in percentage | string | `"80"` | no | | httpcode\_alb\_4xx\_threshold\_critical | loadbalancer 4xx critical threshold in percentage | string | `"80"` | no |
| httpcode\_alb\_4xx\_threshold\_warning | loadbalancer 4xx warning threshold in percentage | string | `"60"` | no | | httpcode\_alb\_4xx\_threshold\_warning | loadbalancer 4xx warning threshold in percentage | string | `"60"` | no |
| httpcode\_alb\_4xx\_time\_aggregator | Monitor aggregator for ALB httpcode 4xx [available values: min, max or avg] | string | `"min"` | no | | httpcode\_alb\_4xx\_time\_aggregator | Monitor aggregator for ALB httpcode 4xx [available values: min, max or avg] | string | `"min"` | no |
| httpcode\_alb\_4xx\_timeframe | Monitor timeframe for ALB httpcode 4xx [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no | | httpcode\_alb\_4xx\_timeframe | Monitor timeframe for ALB httpcode 4xx [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no |
| httpcode\_alb\_5xx\_enabled | Flag to enable ALB httpcode 5xx monitor | string | `"true"` | no | | httpcode\_alb\_5xx\_enabled | Flag to enable ALB httpcode 5xx monitor | string | `"true"` | no |
| httpcode\_alb\_5xx\_extra\_tags | Extra tags for ALB httpcode 5xx monitor | list | `[]` | no | | httpcode\_alb\_5xx\_extra\_tags | Extra tags for ALB httpcode 5xx monitor | list(string) | `[]` | no |
| httpcode\_alb\_5xx\_message | Custom message for ALB httpcode 5xx monitor | string | `""` | no | | httpcode\_alb\_5xx\_message | Custom message for ALB httpcode 5xx monitor | string | `""` | no |
| httpcode\_alb\_5xx\_threshold\_critical | loadbalancer 5xx critical threshold in percentage | string | `"80"` | no | | httpcode\_alb\_5xx\_threshold\_critical | loadbalancer 5xx critical threshold in percentage | string | `"80"` | no |
| httpcode\_alb\_5xx\_threshold\_warning | loadbalancer 5xx warning threshold in percentage | string | `"60"` | no | | httpcode\_alb\_5xx\_threshold\_warning | loadbalancer 5xx warning threshold in percentage | string | `"60"` | no |
| httpcode\_alb\_5xx\_time\_aggregator | Monitor aggregator for ALB httpcode 5xx [available values: min, max or avg] | string | `"min"` | no | | httpcode\_alb\_5xx\_time\_aggregator | Monitor aggregator for ALB httpcode 5xx [available values: min, max or avg] | string | `"min"` | no |
| httpcode\_alb\_5xx\_timeframe | Monitor timeframe for ALB httpcode 5xx [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no | | httpcode\_alb\_5xx\_timeframe | Monitor timeframe for ALB httpcode 5xx [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no |
| httpcode\_target\_4xx\_enabled | Flag to enable ALB target httpcode 4xx monitor | string | `"true"` | no | | httpcode\_target\_4xx\_enabled | Flag to enable ALB target httpcode 4xx monitor | string | `"true"` | no |
| httpcode\_target\_4xx\_extra\_tags | Extra tags for ALB target httpcode 4xx monitor | list | `[]` | no | | httpcode\_target\_4xx\_extra\_tags | Extra tags for ALB target httpcode 4xx monitor | list(string) | `[]` | no |
| httpcode\_target\_4xx\_message | Custom message for ALB target httpcode 4xx monitor | string | `""` | no | | httpcode\_target\_4xx\_message | Custom message for ALB target httpcode 4xx monitor | string | `""` | no |
| httpcode\_target\_4xx\_threshold\_critical | target 4xx critical threshold in percentage | string | `"80"` | no | | httpcode\_target\_4xx\_threshold\_critical | target 4xx critical threshold in percentage | string | `"80"` | no |
| httpcode\_target\_4xx\_threshold\_warning | target 4xx warning threshold in percentage | string | `"60"` | no | | httpcode\_target\_4xx\_threshold\_warning | target 4xx warning threshold in percentage | string | `"60"` | no |
| httpcode\_target\_4xx\_time\_aggregator | Monitor aggregator for ALB target httpcode 4xx [available values: min, max or avg] | string | `"min"` | no | | httpcode\_target\_4xx\_time\_aggregator | Monitor aggregator for ALB target httpcode 4xx [available values: min, max or avg] | string | `"min"` | no |
| httpcode\_target\_4xx\_timeframe | Monitor timeframe for ALB target httpcode 4xx [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no | | httpcode\_target\_4xx\_timeframe | Monitor timeframe for ALB target httpcode 4xx [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no |
| httpcode\_target\_5xx\_enabled | Flag to enable ALB target httpcode 5xx monitor | string | `"true"` | no | | httpcode\_target\_5xx\_enabled | Flag to enable ALB target httpcode 5xx monitor | string | `"true"` | no |
| httpcode\_target\_5xx\_extra\_tags | Extra tags for ALB target httpcode 5xx monitor | list | `[]` | no | | httpcode\_target\_5xx\_extra\_tags | Extra tags for ALB target httpcode 5xx monitor | list(string) | `[]` | no |
| httpcode\_target\_5xx\_message | Custom message for ALB target httpcode 5xx monitor | string | `""` | no | | httpcode\_target\_5xx\_message | Custom message for ALB target httpcode 5xx monitor | string | `""` | no |
| httpcode\_target\_5xx\_threshold\_critical | target 5xx critical threshold in percentage | string | `"80"` | no | | httpcode\_target\_5xx\_threshold\_critical | target 5xx critical threshold in percentage | string | `"80"` | no |
| httpcode\_target\_5xx\_threshold\_warning | target 5xx warning threshold in percentage | string | `"60"` | no | | httpcode\_target\_5xx\_threshold\_warning | target 5xx warning threshold in percentage | string | `"60"` | no |
| httpcode\_target\_5xx\_time\_aggregator | Monitor aggregator for ALB target httpcode 5xx [available values: min, max or avg] | string | `"min"` | no | | httpcode\_target\_5xx\_time\_aggregator | Monitor aggregator for ALB target httpcode 5xx [available values: min, max or avg] | string | `"min"` | no |
| httpcode\_target\_5xx\_timeframe | Monitor timeframe for ALB target httpcode 5xx [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no | | httpcode\_target\_5xx\_timeframe | Monitor timeframe for ALB target httpcode 5xx [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no |
| latency\_enabled | Flag to enable ALB latency monitor | string | `"true"` | no | | latency\_enabled | Flag to enable ALB latency monitor | string | `"true"` | no |
| latency\_extra\_tags | Extra tags for ALB latency monitor | list | `[]` | no | | latency\_extra\_tags | Extra tags for ALB latency monitor | list(string) | `[]` | no |
| latency\_message | Custom message for ALB latency monitor | string | `""` | no | | latency\_message | Custom message for ALB latency monitor | string | `""` | no |
| latency\_threshold\_critical | latency critical threshold in milliseconds | string | `"1000"` | no | | latency\_threshold\_critical | latency critical threshold in milliseconds | string | `"1000"` | no |
| latency\_threshold\_warning | latency warning threshold in milliseconds | string | `"500"` | no | | latency\_threshold\_warning | latency warning threshold in milliseconds | string | `"500"` | no |

View File

@ -2,7 +2,7 @@
variable "environment" { variable "environment" {
description = "Architecture environment" description = "Architecture environment"
type = "string" type = string
} }
variable "filter_tags_use_defaults" { variable "filter_tags_use_defaults" {
@ -43,61 +43,61 @@ variable "prefix_slug" {
variable "alb_no_healthy_instances_enabled" { variable "alb_no_healthy_instances_enabled" {
description = "Flag to enable ALB no healthy instances monitor" description = "Flag to enable ALB no healthy instances monitor"
type = "string" type = string
default = "true" default = "true"
} }
variable "alb_no_healthy_instances_extra_tags" { variable "alb_no_healthy_instances_extra_tags" {
description = "Extra tags for ALB no healthy instances monitor" description = "Extra tags for ALB no healthy instances monitor"
type = "list" type = list(string)
default = [] default = []
} }
variable "alb_no_healthy_instances_message" { variable "alb_no_healthy_instances_message" {
description = "Custom message for ALB no healthy instances monitor" description = "Custom message for ALB no healthy instances monitor"
type = "string" type = string
default = "" default = ""
} }
variable "alb_no_healthy_instances_time_aggregator" { variable "alb_no_healthy_instances_time_aggregator" {
description = "Monitor aggregator for ALB no healthy instances [available values: min, max or avg]" description = "Monitor aggregator for ALB no healthy instances [available values: min, max or avg]"
type = "string" type = string
default = "min" default = "min"
} }
variable "alb_no_healthy_instances_timeframe" { variable "alb_no_healthy_instances_timeframe" {
description = "Monitor timeframe for ALB no healthy instances [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" description = "Monitor timeframe for ALB no healthy instances [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string" type = string
default = "last_5m" default = "last_5m"
} }
variable "latency_enabled" { variable "latency_enabled" {
description = "Flag to enable ALB latency monitor" description = "Flag to enable ALB latency monitor"
type = "string" type = string
default = "true" default = "true"
} }
variable "latency_extra_tags" { variable "latency_extra_tags" {
description = "Extra tags for ALB latency monitor" description = "Extra tags for ALB latency monitor"
type = "list" type = list(string)
default = [] default = []
} }
variable "latency_message" { variable "latency_message" {
description = "Custom message for ALB latency monitor" description = "Custom message for ALB latency monitor"
type = "string" type = string
default = "" default = ""
} }
variable "latency_time_aggregator" { variable "latency_time_aggregator" {
description = "Monitor aggregator for ALB latency [available values: min, max or avg]" description = "Monitor aggregator for ALB latency [available values: min, max or avg]"
type = "string" type = string
default = "min" default = "min"
} }
variable "latency_timeframe" { variable "latency_timeframe" {
description = "Monitor timeframe for ALB latency [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" description = "Monitor timeframe for ALB latency [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string" type = string
default = "last_5m" default = "last_5m"
} }
@ -113,31 +113,31 @@ variable "latency_threshold_warning" {
variable "httpcode_alb_4xx_enabled" { variable "httpcode_alb_4xx_enabled" {
description = "Flag to enable ALB httpcode 4xx monitor" description = "Flag to enable ALB httpcode 4xx monitor"
type = "string" type = string
default = "true" default = "true"
} }
variable "httpcode_alb_4xx_extra_tags" { variable "httpcode_alb_4xx_extra_tags" {
description = "Extra tags for ALB httpcode 4xx monitor" description = "Extra tags for ALB httpcode 4xx monitor"
type = "list" type = list(string)
default = [] default = []
} }
variable "httpcode_alb_4xx_message" { variable "httpcode_alb_4xx_message" {
description = "Custom message for ALB httpcode 4xx monitor" description = "Custom message for ALB httpcode 4xx monitor"
type = "string" type = string
default = "" default = ""
} }
variable "httpcode_alb_4xx_time_aggregator" { variable "httpcode_alb_4xx_time_aggregator" {
description = "Monitor aggregator for ALB httpcode 4xx [available values: min, max or avg]" description = "Monitor aggregator for ALB httpcode 4xx [available values: min, max or avg]"
type = "string" type = string
default = "min" default = "min"
} }
variable "httpcode_alb_4xx_timeframe" { variable "httpcode_alb_4xx_timeframe" {
description = "Monitor timeframe for ALB httpcode 4xx [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" description = "Monitor timeframe for ALB httpcode 4xx [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string" type = string
default = "last_5m" default = "last_5m"
} }
@ -153,31 +153,31 @@ variable "httpcode_alb_4xx_threshold_warning" {
variable "httpcode_target_4xx_enabled" { variable "httpcode_target_4xx_enabled" {
description = "Flag to enable ALB target httpcode 4xx monitor" description = "Flag to enable ALB target httpcode 4xx monitor"
type = "string" type = string
default = "true" default = "true"
} }
variable "httpcode_target_4xx_extra_tags" { variable "httpcode_target_4xx_extra_tags" {
description = "Extra tags for ALB target httpcode 4xx monitor" description = "Extra tags for ALB target httpcode 4xx monitor"
type = "list" type = list(string)
default = [] default = []
} }
variable "httpcode_target_4xx_message" { variable "httpcode_target_4xx_message" {
description = "Custom message for ALB target httpcode 4xx monitor" description = "Custom message for ALB target httpcode 4xx monitor"
type = "string" type = string
default = "" default = ""
} }
variable "httpcode_target_4xx_time_aggregator" { variable "httpcode_target_4xx_time_aggregator" {
description = "Monitor aggregator for ALB target httpcode 4xx [available values: min, max or avg]" description = "Monitor aggregator for ALB target httpcode 4xx [available values: min, max or avg]"
type = "string" type = string
default = "min" default = "min"
} }
variable "httpcode_target_4xx_timeframe" { variable "httpcode_target_4xx_timeframe" {
description = "Monitor timeframe for ALB target httpcode 4xx [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" description = "Monitor timeframe for ALB target httpcode 4xx [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string" type = string
default = "last_5m" default = "last_5m"
} }
@ -193,31 +193,31 @@ variable "httpcode_target_4xx_threshold_warning" {
variable "httpcode_alb_5xx_enabled" { variable "httpcode_alb_5xx_enabled" {
description = "Flag to enable ALB httpcode 5xx monitor" description = "Flag to enable ALB httpcode 5xx monitor"
type = "string" type = string
default = "true" default = "true"
} }
variable "httpcode_alb_5xx_extra_tags" { variable "httpcode_alb_5xx_extra_tags" {
description = "Extra tags for ALB httpcode 5xx monitor" description = "Extra tags for ALB httpcode 5xx monitor"
type = "list" type = list(string)
default = [] default = []
} }
variable "httpcode_alb_5xx_message" { variable "httpcode_alb_5xx_message" {
description = "Custom message for ALB httpcode 5xx monitor" description = "Custom message for ALB httpcode 5xx monitor"
type = "string" type = string
default = "" default = ""
} }
variable "httpcode_alb_5xx_time_aggregator" { variable "httpcode_alb_5xx_time_aggregator" {
description = "Monitor aggregator for ALB httpcode 5xx [available values: min, max or avg]" description = "Monitor aggregator for ALB httpcode 5xx [available values: min, max or avg]"
type = "string" type = string
default = "min" default = "min"
} }
variable "httpcode_alb_5xx_timeframe" { variable "httpcode_alb_5xx_timeframe" {
description = "Monitor timeframe for ALB httpcode 5xx [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" description = "Monitor timeframe for ALB httpcode 5xx [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string" type = string
default = "last_5m" default = "last_5m"
} }
@ -233,31 +233,31 @@ variable "httpcode_alb_5xx_threshold_warning" {
variable "httpcode_target_5xx_enabled" { variable "httpcode_target_5xx_enabled" {
description = "Flag to enable ALB target httpcode 5xx monitor" description = "Flag to enable ALB target httpcode 5xx monitor"
type = "string" type = string
default = "true" default = "true"
} }
variable "httpcode_target_5xx_extra_tags" { variable "httpcode_target_5xx_extra_tags" {
description = "Extra tags for ALB target httpcode 5xx monitor" description = "Extra tags for ALB target httpcode 5xx monitor"
type = "list" type = list(string)
default = [] default = []
} }
variable "httpcode_target_5xx_message" { variable "httpcode_target_5xx_message" {
description = "Custom message for ALB target httpcode 5xx monitor" description = "Custom message for ALB target httpcode 5xx monitor"
type = "string" type = string
default = "" default = ""
} }
variable "httpcode_target_5xx_time_aggregator" { variable "httpcode_target_5xx_time_aggregator" {
description = "Monitor aggregator for ALB target httpcode 5xx [available values: min, max or avg]" description = "Monitor aggregator for ALB target httpcode 5xx [available values: min, max or avg]"
type = "string" type = string
default = "min" default = "min"
} }
variable "httpcode_target_5xx_timeframe" { variable "httpcode_target_5xx_timeframe" {
description = "Monitor timeframe for ALB target httpcode 5xx [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" description = "Monitor timeframe for ALB target httpcode 5xx [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string" type = string
default = "last_5m" default = "last_5m"
} }
@ -275,3 +275,4 @@ variable "artificial_requests_count" {
default = 5 default = 5
description = "Number of false requests used to mitigate false positive in case of low trafic" description = "Number of false requests used to mitigate false positive in case of low trafic"
} }

View File

@ -1,9 +1,10 @@
module "filter-tags" { module "filter-tags" {
source = "../../../common/filter-tags" source = "../../../common/filter-tags"
environment = "${var.environment}" environment = var.environment
resource = "aws_alb" resource = "aws_alb"
filter_tags_use_defaults = "${var.filter_tags_use_defaults}" filter_tags_use_defaults = var.filter_tags_use_defaults
filter_tags_custom = "${var.filter_tags_custom}" filter_tags_custom = var.filter_tags_custom
filter_tags_custom_excluded = "${var.filter_tags_custom_excluded}" filter_tags_custom_excluded = var.filter_tags_custom_excluded
} }

View File

@ -1,8 +1,8 @@
resource "datadog_monitor" "ALB_no_healthy_instances" { resource "datadog_monitor" "ALB_no_healthy_instances" {
count = "${var.alb_no_healthy_instances_enabled == "true" ? 1 : 0}" count = var.alb_no_healthy_instances_enabled == "true" ? 1 : 0
name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] ALB healthy instances {{#is_alert}}is at 0{{/is_alert}}{{#is_warning}}is at {{value}}%{{/is_warning}}" name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] ALB healthy instances {{#is_alert}}is at 0{{/is_alert}}{{#is_warning}}is at {{value}}%%{{/is_warning}}"
message = coalesce(var.alb_no_healthy_instances_message, var.message)
type = "query alert" type = "query alert"
message = "${coalesce(var.alb_no_healthy_instances_message, var.message)}"
query = <<EOQ query = <<EOQ
${var.alb_no_healthy_instances_time_aggregator}(${var.alb_no_healthy_instances_timeframe}): ( ${var.alb_no_healthy_instances_time_aggregator}(${var.alb_no_healthy_instances_timeframe}): (
@ -10,170 +10,165 @@ resource "datadog_monitor" "ALB_no_healthy_instances" {
sum:aws.applicationelb.healthy_host_count.maximum${module.filter-tags.query_alert} by {region,loadbalancer} + sum:aws.applicationelb.healthy_host_count.maximum${module.filter-tags.query_alert} by {region,loadbalancer} +
sum:aws.applicationelb.un_healthy_host_count.maximum${module.filter-tags.query_alert} by {region,loadbalancer} ) sum:aws.applicationelb.un_healthy_host_count.maximum${module.filter-tags.query_alert} by {region,loadbalancer} )
) * 100 < 1 ) * 100 < 1
EOQ EOQ
evaluation_delay = "${var.evaluation_delay}" thresholds = {
new_host_delay = "${var.new_host_delay}"
thresholds {
critical = 1 critical = 1
warning = 100 warning = 100
} }
evaluation_delay = var.evaluation_delay
new_host_delay = var.new_host_delay
notify_no_data = true notify_no_data = true
renotify_interval = 0 renotify_interval = 0
require_full_window = false require_full_window = false
timeout_h = 0 timeout_h = 0
include_tags = true include_tags = true
tags = ["env:${var.environment}", "type:cloud", "provider:aws", "resource:alb", "team:claranet", "created-by:terraform", "${var.alb_no_healthy_instances_extra_tags}"] tags = concat(["env:${var.environment}", "type:cloud", "provider:aws", "resource:alb", "team:claranet", "created-by:terraform"], var.alb_no_healthy_instances_extra_tags)
} }
resource "datadog_monitor" "ALB_latency" { resource "datadog_monitor" "ALB_latency" {
count = "${var.latency_enabled == "true" ? 1 : 0}" count = var.latency_enabled == "true" ? 1 : 0
name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] ALB latency {{#is_alert}}{{{comparator}}} {{threshold}}s ({{value}}s){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}s ({{value}}s){{/is_warning}}" name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] ALB latency {{#is_alert}}{{{comparator}}} {{threshold}}s ({{value}}s){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}s ({{value}}s){{/is_warning}}"
message = coalesce(var.latency_message, var.message)
type = "query alert" type = "query alert"
message = "${coalesce(var.latency_message, var.message)}"
query = <<EOQ query = <<EOQ
${var.latency_time_aggregator}(${var.latency_timeframe}): ${var.latency_time_aggregator}(${var.latency_timeframe}):
default(avg:aws.applicationelb.target_response_time.average${module.filter-tags.query_alert} by {region,loadbalancer}, 0) default(avg:aws.applicationelb.target_response_time.average${module.filter-tags.query_alert} by {region,loadbalancer}, 0)
> ${var.latency_threshold_critical} > ${var.latency_threshold_critical}
EOQ EOQ
evaluation_delay = "${var.evaluation_delay}" thresholds = {
new_host_delay = "${var.new_host_delay}" critical = var.latency_threshold_critical
warning = var.latency_threshold_warning
thresholds {
critical = "${var.latency_threshold_critical}"
warning = "${var.latency_threshold_warning}"
} }
evaluation_delay = var.evaluation_delay
new_host_delay = var.new_host_delay
notify_no_data = false notify_no_data = false
renotify_interval = 0 renotify_interval = 0
require_full_window = false require_full_window = false
timeout_h = 0 timeout_h = 0
include_tags = true include_tags = true
tags = ["env:${var.environment}", "type:cloud", "provider:aws", "resource:alb", "team:claranet", "created-by:terraform", "${var.latency_extra_tags}"] tags = concat(["env:${var.environment}", "type:cloud", "provider:aws", "resource:alb", "team:claranet", "created-by:terraform"], var.latency_extra_tags)
} }
resource "datadog_monitor" "ALB_httpcode_5xx" { resource "datadog_monitor" "ALB_httpcode_5xx" {
count = "${var.httpcode_alb_5xx_enabled == "true" ? 1 : 0}" count = var.httpcode_alb_5xx_enabled == "true" ? 1 : 0
name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] ALB HTTP code 5xx {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] ALB HTTP code 5xx {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
message = coalesce(var.httpcode_alb_5xx_message, var.message)
type = "query alert" type = "query alert"
message = "${coalesce(var.httpcode_alb_5xx_message, var.message)}"
query = <<EOQ query = <<EOQ
${var.httpcode_alb_5xx_time_aggregator}(${var.httpcode_alb_5xx_timeframe}): ${var.httpcode_alb_5xx_time_aggregator}(${var.httpcode_alb_5xx_timeframe}):
default(avg:aws.applicationelb.httpcode_elb_5xx${module.filter-tags.query_alert} by {region,loadbalancer}.as_rate(), 0) / ( default(avg:aws.applicationelb.httpcode_elb_5xx${module.filter-tags.query_alert} by {region,loadbalancer}.as_rate(), 0) / (
default(avg:aws.applicationelb.request_count${module.filter-tags.query_alert} by {region,loadbalancer}.as_rate() + ${var.artificial_requests_count}, 1)) default(avg:aws.applicationelb.request_count${module.filter-tags.query_alert} by {region,loadbalancer}.as_rate() + ${var.artificial_requests_count}, 1))
* 100 > ${var.httpcode_alb_5xx_threshold_critical} * 100 > ${var.httpcode_alb_5xx_threshold_critical}
EOQ EOQ
evaluation_delay = "${var.evaluation_delay}" thresholds = {
new_host_delay = "${var.new_host_delay}" critical = var.httpcode_alb_5xx_threshold_critical
warning = var.httpcode_alb_5xx_threshold_warning
thresholds {
critical = "${var.httpcode_alb_5xx_threshold_critical}"
warning = "${var.httpcode_alb_5xx_threshold_warning}"
} }
evaluation_delay = var.evaluation_delay
new_host_delay = var.new_host_delay
notify_no_data = false notify_no_data = false
renotify_interval = 0 renotify_interval = 0
require_full_window = false require_full_window = false
timeout_h = 0 timeout_h = 0
include_tags = true include_tags = true
tags = ["env:${var.environment}", "type:cloud", "provider:aws", "resource:alb", "team:claranet", "created-by:terraform", "${var.httpcode_alb_5xx_extra_tags}"] tags = concat(["env:${var.environment}", "type:cloud", "provider:aws", "resource:alb", "team:claranet", "created-by:terraform"], var.httpcode_alb_5xx_extra_tags)
} }
resource "datadog_monitor" "ALB_httpcode_4xx" { resource "datadog_monitor" "ALB_httpcode_4xx" {
count = "${var.httpcode_alb_4xx_enabled == "true" ? 1 : 0}" count = var.httpcode_alb_4xx_enabled == "true" ? 1 : 0
name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] ALB HTTP code 4xx {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] ALB HTTP code 4xx {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
message = coalesce(var.httpcode_alb_4xx_message, var.message)
type = "query alert" type = "query alert"
message = "${coalesce(var.httpcode_alb_4xx_message, var.message)}"
query = <<EOQ query = <<EOQ
${var.httpcode_alb_4xx_time_aggregator}(${var.httpcode_alb_4xx_timeframe}): ${var.httpcode_alb_4xx_time_aggregator}(${var.httpcode_alb_4xx_timeframe}):
default(avg:aws.applicationelb.httpcode_elb_4xx${module.filter-tags.query_alert} by {region,loadbalancer}.as_rate(), 0) / ( default(avg:aws.applicationelb.httpcode_elb_4xx${module.filter-tags.query_alert} by {region,loadbalancer}.as_rate(), 0) / (
default(avg:aws.applicationelb.request_count${module.filter-tags.query_alert} by {region,loadbalancer}.as_rate() + ${var.artificial_requests_count}, 1)) default(avg:aws.applicationelb.request_count${module.filter-tags.query_alert} by {region,loadbalancer}.as_rate() + ${var.artificial_requests_count}, 1))
* 100 > ${var.httpcode_alb_4xx_threshold_critical} * 100 > ${var.httpcode_alb_4xx_threshold_critical}
EOQ EOQ
evaluation_delay = "${var.evaluation_delay}" thresholds = {
new_host_delay = "${var.new_host_delay}" critical = var.httpcode_alb_4xx_threshold_critical
warning = var.httpcode_alb_4xx_threshold_warning
thresholds {
critical = "${var.httpcode_alb_4xx_threshold_critical}"
warning = "${var.httpcode_alb_4xx_threshold_warning}"
} }
evaluation_delay = var.evaluation_delay
new_host_delay = var.new_host_delay
notify_no_data = false notify_no_data = false
renotify_interval = 0 renotify_interval = 0
require_full_window = false require_full_window = false
timeout_h = 0 timeout_h = 0
include_tags = true include_tags = true
tags = ["env:${var.environment}", "type:cloud", "provider:aws", "resource:alb", "team:claranet", "created-by:terraform", "${var.httpcode_alb_4xx_extra_tags}"] tags = concat(["env:${var.environment}", "type:cloud", "provider:aws", "resource:alb", "team:claranet", "created-by:terraform"], var.httpcode_alb_4xx_extra_tags)
} }
resource "datadog_monitor" "ALB_httpcode_target_5xx" { resource "datadog_monitor" "ALB_httpcode_target_5xx" {
count = "${var.httpcode_target_5xx_enabled == "true" ? 1 : 0}" count = var.httpcode_target_5xx_enabled == "true" ? 1 : 0
name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] ALB target HTTP code 5xx {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] ALB target HTTP code 5xx {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
message = coalesce(var.httpcode_target_5xx_message, var.message)
type = "query alert" type = "query alert"
message = "${coalesce(var.httpcode_target_5xx_message, var.message)}"
query = <<EOQ query = <<EOQ
${var.httpcode_target_5xx_time_aggregator}(${var.httpcode_target_5xx_timeframe}): ${var.httpcode_target_5xx_time_aggregator}(${var.httpcode_target_5xx_timeframe}):
default(avg:aws.applicationelb.httpcode_target_5xx${module.filter-tags.query_alert} by {region,loadbalancer}.as_rate(), 0) / ( default(avg:aws.applicationelb.httpcode_target_5xx${module.filter-tags.query_alert} by {region,loadbalancer}.as_rate(), 0) / (
default(avg:aws.applicationelb.request_count${module.filter-tags.query_alert} by {region,loadbalancer}.as_rate() + ${var.artificial_requests_count}, 1)) default(avg:aws.applicationelb.request_count${module.filter-tags.query_alert} by {region,loadbalancer}.as_rate() + ${var.artificial_requests_count}, 1))
* 100 > ${var.httpcode_target_5xx_threshold_critical} * 100 > ${var.httpcode_target_5xx_threshold_critical}
EOQ EOQ
evaluation_delay = "${var.evaluation_delay}" thresholds = {
new_host_delay = "${var.new_host_delay}" critical = var.httpcode_target_5xx_threshold_critical
warning = var.httpcode_target_5xx_threshold_warning
thresholds {
critical = "${var.httpcode_target_5xx_threshold_critical}"
warning = "${var.httpcode_target_5xx_threshold_warning}"
} }
evaluation_delay = var.evaluation_delay
new_host_delay = var.new_host_delay
notify_no_data = false notify_no_data = false
renotify_interval = 0 renotify_interval = 0
require_full_window = false require_full_window = false
timeout_h = 0 timeout_h = 0
include_tags = true include_tags = true
tags = ["env:${var.environment}", "type:cloud", "provider:aws", "resource:alb", "team:claranet", "created-by:terraform", "${var.httpcode_target_5xx_extra_tags}"] tags = concat(["env:${var.environment}", "type:cloud", "provider:aws", "resource:alb", "team:claranet", "created-by:terraform"], var.httpcode_target_5xx_extra_tags)
} }
resource "datadog_monitor" "ALB_httpcode_target_4xx" { resource "datadog_monitor" "ALB_httpcode_target_4xx" {
count = "${var.httpcode_target_4xx_enabled == "true" ? 1 : 0}" count = var.httpcode_target_4xx_enabled == "true" ? 1 : 0
name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] ALB target HTTP code 4xx {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] ALB target HTTP code 4xx {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
message = coalesce(var.httpcode_target_4xx_message, var.message)
type = "query alert" type = "query alert"
message = "${coalesce(var.httpcode_target_4xx_message, var.message)}"
query = <<EOQ query = <<EOQ
${var.httpcode_target_4xx_time_aggregator}(${var.httpcode_target_4xx_timeframe}): ${var.httpcode_target_4xx_time_aggregator}(${var.httpcode_target_4xx_timeframe}):
default(avg:aws.applicationelb.httpcode_target_4xx${module.filter-tags.query_alert} by {region,loadbalancer}.as_rate(), 0) / ( default(avg:aws.applicationelb.httpcode_target_4xx${module.filter-tags.query_alert} by {region,loadbalancer}.as_rate(), 0) / (
default(avg:aws.applicationelb.request_count${module.filter-tags.query_alert} by {region,loadbalancer}.as_rate() + ${var.artificial_requests_count}, 1)) default(avg:aws.applicationelb.request_count${module.filter-tags.query_alert} by {region,loadbalancer}.as_rate() + ${var.artificial_requests_count}, 1))
* 100 > ${var.httpcode_target_4xx_threshold_critical} * 100 > ${var.httpcode_target_4xx_threshold_critical}
EOQ EOQ
evaluation_delay = "${var.evaluation_delay}" thresholds = {
new_host_delay = "${var.new_host_delay}" critical = var.httpcode_target_4xx_threshold_critical
warning = var.httpcode_target_4xx_threshold_warning
thresholds {
critical = "${var.httpcode_target_4xx_threshold_critical}"
warning = "${var.httpcode_target_4xx_threshold_warning}"
} }
evaluation_delay = var.evaluation_delay
new_host_delay = var.new_host_delay
notify_no_data = false notify_no_data = false
renotify_interval = 0 renotify_interval = 0
require_full_window = false require_full_window = false
timeout_h = 0 timeout_h = 0
include_tags = true include_tags = true
tags = ["env:${var.environment}", "type:cloud", "provider:aws", "resource:alb", "team:claranet", "created-by:terraform", "${var.httpcode_target_4xx_extra_tags}"] tags = concat(["env:${var.environment}", "type:cloud", "provider:aws", "resource:alb", "team:claranet", "created-by:terraform"], var.httpcode_target_4xx_extra_tags)
} }

View File

@ -1,29 +1,30 @@
output "ALB_no_healthy_instances_id" { output "ALB_no_healthy_instances_id" {
description = "id for monitor ALB_no_healthy_instances" description = "id for monitor ALB_no_healthy_instances"
value = "${datadog_monitor.ALB_no_healthy_instances.*.id}" value = datadog_monitor.ALB_no_healthy_instances.*.id
} }
output "ALB_latency_id" { output "ALB_latency_id" {
description = "id for monitor ALB_latency" description = "id for monitor ALB_latency"
value = "${datadog_monitor.ALB_latency.*.id}" value = datadog_monitor.ALB_latency.*.id
} }
output "ALB_httpcode_5xx_id" { output "ALB_httpcode_5xx_id" {
description = "id for monitor ALB_httpcode_5xx" description = "id for monitor ALB_httpcode_5xx"
value = "${datadog_monitor.ALB_httpcode_5xx.*.id}" value = datadog_monitor.ALB_httpcode_5xx.*.id
} }
output "ALB_httpcode_4xx_id" { output "ALB_httpcode_4xx_id" {
description = "id for monitor ALB_httpcode_4xx" description = "id for monitor ALB_httpcode_4xx"
value = "${datadog_monitor.ALB_httpcode_4xx.*.id}" value = datadog_monitor.ALB_httpcode_4xx.*.id
} }
output "ALB_httpcode_target_5xx_id" { output "ALB_httpcode_target_5xx_id" {
description = "id for monitor ALB_httpcode_target_5xx" description = "id for monitor ALB_httpcode_target_5xx"
value = "${datadog_monitor.ALB_httpcode_target_5xx.*.id}" value = datadog_monitor.ALB_httpcode_target_5xx.*.id
} }
output "ALB_httpcode_target_4xx_id" { output "ALB_httpcode_target_4xx_id" {
description = "id for monitor ALB_httpcode_target_4xx" description = "id for monitor ALB_httpcode_target_4xx"
value = "${datadog_monitor.ALB_httpcode_target_4xx.*.id}" value = datadog_monitor.ALB_httpcode_target_4xx.*.id
} }

View File

@ -6,8 +6,8 @@
module "datadog-monitors-cloud-aws-apigateway" { module "datadog-monitors-cloud-aws-apigateway" {
source = "git::ssh://git@git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors.git//cloud/aws/apigateway?ref={revision}" source = "git::ssh://git@git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors.git//cloud/aws/apigateway?ref={revision}"
environment = "${var.environment}" environment = var.environment
message = "${module.datadog-message-alerting.alerting-message}" message = module.datadog-message-alerting.alerting-message
} }
``` ```
@ -29,21 +29,21 @@ Creates DataDog monitors with the following checks:
| evaluation\_delay | Delay in seconds for the metric evaluation | string | `"900"` | no | | evaluation\_delay | Delay in seconds for the metric evaluation | string | `"900"` | no |
| filter\_tags | Tags used for filtering | string | `"*"` | no | | filter\_tags | Tags used for filtering | string | `"*"` | no |
| http\_4xx\_requests\_enabled | Flag to enable API Gateway HTTP 4xx requests monitor | string | `"true"` | no | | http\_4xx\_requests\_enabled | Flag to enable API Gateway HTTP 4xx requests monitor | string | `"true"` | no |
| http\_4xx\_requests\_extra\_tags | Extra tags for API Gateway HTTP 4xx requests monitor | list | `[]` | no | | http\_4xx\_requests\_extra\_tags | Extra tags for API Gateway HTTP 4xx requests monitor | list(string) | `[]` | no |
| http\_4xx\_requests\_message | Custom message for API Gateway HTTP 4xx requests monitor | string | `""` | no | | http\_4xx\_requests\_message | Custom message for API Gateway HTTP 4xx requests monitor | string | `""` | no |
| http\_4xx\_requests\_threshold\_critical | Maximum critical acceptable percent of 4xx errors | string | `"30"` | no | | http\_4xx\_requests\_threshold\_critical | Maximum critical acceptable percent of 4xx errors | string | `"30"` | no |
| http\_4xx\_requests\_threshold\_warning | Maximum warning acceptable percent of 4xx errors | string | `"15"` | no | | http\_4xx\_requests\_threshold\_warning | Maximum warning acceptable percent of 4xx errors | string | `"15"` | no |
| http\_4xx\_requests\_time\_aggregator | Monitor aggregator for API HTTP 4xx requests [available values: min, max or avg] | string | `"min"` | no | | http\_4xx\_requests\_time\_aggregator | Monitor aggregator for API HTTP 4xx requests [available values: min, max or avg] | string | `"min"` | no |
| http\_4xx\_requests\_timeframe | Monitor timeframe for API HTTP 4xx requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no | | http\_4xx\_requests\_timeframe | Monitor timeframe for API HTTP 4xx requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no |
| http\_5xx\_requests\_enabled | Flag to enable API Gateway HTTP 5xx requests monitor | string | `"true"` | no | | http\_5xx\_requests\_enabled | Flag to enable API Gateway HTTP 5xx requests monitor | string | `"true"` | no |
| http\_5xx\_requests\_extra\_tags | Extra tags for API Gateway HTTP 5xx requests monitor | list | `[]` | no | | http\_5xx\_requests\_extra\_tags | Extra tags for API Gateway HTTP 5xx requests monitor | list(string) | `[]` | no |
| http\_5xx\_requests\_message | Custom message for API Gateway HTTP 5xx requests monitor | string | `""` | no | | http\_5xx\_requests\_message | Custom message for API Gateway HTTP 5xx requests monitor | string | `""` | no |
| http\_5xx\_requests\_threshold\_critical | Maximum critical acceptable percent of 5xx errors | string | `"20"` | no | | http\_5xx\_requests\_threshold\_critical | Maximum critical acceptable percent of 5xx errors | string | `"20"` | no |
| http\_5xx\_requests\_threshold\_warning | Maximum warning acceptable percent of 5xx errors | string | `"10"` | no | | http\_5xx\_requests\_threshold\_warning | Maximum warning acceptable percent of 5xx errors | string | `"10"` | no |
| http\_5xx\_requests\_time\_aggregator | Monitor aggregator for API HTTP 5xx requests [available values: min, max or avg] | string | `"min"` | no | | http\_5xx\_requests\_time\_aggregator | Monitor aggregator for API HTTP 5xx requests [available values: min, max or avg] | string | `"min"` | no |
| http\_5xx\_requests\_timeframe | Monitor timeframe for API HTTP 5xx requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no | | http\_5xx\_requests\_timeframe | Monitor timeframe for API HTTP 5xx requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no |
| latency\_enabled | Flag to enable API Gateway latency monitor | string | `"true"` | no | | latency\_enabled | Flag to enable API Gateway latency monitor | string | `"true"` | no |
| latency\_extra\_tags | Extra tags for API Gateway latency monitor | list | `[]` | no | | latency\_extra\_tags | Extra tags for API Gateway latency monitor | list(string) | `[]` | no |
| latency\_message | Custom message for API Gateway latency monitor | string | `""` | no | | latency\_message | Custom message for API Gateway latency monitor | string | `""` | no |
| latency\_threshold\_critical | Alerting threshold in milliseconds | string | `"800"` | no | | latency\_threshold\_critical | Alerting threshold in milliseconds | string | `"800"` | no |
| latency\_threshold\_warning | Warning threshold in milliseconds | string | `"400"` | no | | latency\_threshold\_warning | Warning threshold in milliseconds | string | `"400"` | no |

View File

@ -1,6 +1,6 @@
variable "environment" { variable "environment" {
description = "Environment" description = "Environment"
type = "string" type = string
} }
variable "filter_tags" { variable "filter_tags" {
@ -33,31 +33,31 @@ variable "prefix_slug" {
variable "latency_enabled" { variable "latency_enabled" {
description = "Flag to enable API Gateway latency monitor" description = "Flag to enable API Gateway latency monitor"
type = "string" type = string
default = "true" default = "true"
} }
variable "latency_extra_tags" { variable "latency_extra_tags" {
description = "Extra tags for API Gateway latency monitor" description = "Extra tags for API Gateway latency monitor"
type = "list" type = list(string)
default = [] default = []
} }
variable "latency_message" { variable "latency_message" {
description = "Custom message for API Gateway latency monitor" description = "Custom message for API Gateway latency monitor"
type = "string" type = string
default = "" default = ""
} }
variable "latency_time_aggregator" { variable "latency_time_aggregator" {
description = "Monitor aggregator for API Gateway latency [available values: min, max or avg]" description = "Monitor aggregator for API Gateway latency [available values: min, max or avg]"
type = "string" type = string
default = "min" default = "min"
} }
variable "latency_timeframe" { variable "latency_timeframe" {
description = "Monitor timeframe for API latency [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" description = "Monitor timeframe for API latency [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string" type = string
default = "last_5m" default = "last_5m"
} }
@ -77,31 +77,31 @@ variable "latency_threshold_warning" {
variable "http_5xx_requests_enabled" { variable "http_5xx_requests_enabled" {
description = "Flag to enable API Gateway HTTP 5xx requests monitor" description = "Flag to enable API Gateway HTTP 5xx requests monitor"
type = "string" type = string
default = "true" default = "true"
} }
variable "http_5xx_requests_extra_tags" { variable "http_5xx_requests_extra_tags" {
description = "Extra tags for API Gateway HTTP 5xx requests monitor" description = "Extra tags for API Gateway HTTP 5xx requests monitor"
type = "list" type = list(string)
default = [] default = []
} }
variable "http_5xx_requests_message" { variable "http_5xx_requests_message" {
description = "Custom message for API Gateway HTTP 5xx requests monitor" description = "Custom message for API Gateway HTTP 5xx requests monitor"
type = "string" type = string
default = "" default = ""
} }
variable "http_5xx_requests_time_aggregator" { variable "http_5xx_requests_time_aggregator" {
description = "Monitor aggregator for API HTTP 5xx requests [available values: min, max or avg]" description = "Monitor aggregator for API HTTP 5xx requests [available values: min, max or avg]"
type = "string" type = string
default = "min" default = "min"
} }
variable "http_5xx_requests_timeframe" { variable "http_5xx_requests_timeframe" {
description = "Monitor timeframe for API HTTP 5xx requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" description = "Monitor timeframe for API HTTP 5xx requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string" type = string
default = "last_5m" default = "last_5m"
} }
@ -121,31 +121,31 @@ variable "http_5xx_requests_threshold_warning" {
variable "http_4xx_requests_enabled" { variable "http_4xx_requests_enabled" {
description = "Flag to enable API Gateway HTTP 4xx requests monitor" description = "Flag to enable API Gateway HTTP 4xx requests monitor"
type = "string" type = string
default = "true" default = "true"
} }
variable "http_4xx_requests_extra_tags" { variable "http_4xx_requests_extra_tags" {
description = "Extra tags for API Gateway HTTP 4xx requests monitor" description = "Extra tags for API Gateway HTTP 4xx requests monitor"
type = "list" type = list(string)
default = [] default = []
} }
variable "http_4xx_requests_message" { variable "http_4xx_requests_message" {
description = "Custom message for API Gateway HTTP 4xx requests monitor" description = "Custom message for API Gateway HTTP 4xx requests monitor"
type = "string" type = string
default = "" default = ""
} }
variable "http_4xx_requests_time_aggregator" { variable "http_4xx_requests_time_aggregator" {
description = "Monitor aggregator for API HTTP 4xx requests [available values: min, max or avg]" description = "Monitor aggregator for API HTTP 4xx requests [available values: min, max or avg]"
type = "string" type = string
default = "min" default = "min"
} }
variable "http_4xx_requests_timeframe" { variable "http_4xx_requests_timeframe" {
description = "Monitor timeframe for API HTTP 4xx requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" description = "Monitor timeframe for API HTTP 4xx requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string" type = string
default = "last_5m" default = "last_5m"
} }
@ -163,3 +163,4 @@ variable "artificial_requests_count" {
default = 5 default = 5
description = "Number of false requests used to mitigate false positive in case of low trafic" description = "Number of false requests used to mitigate false positive in case of low trafic"
} }

View File

@ -1,91 +1,89 @@
# Monitoring Api Gateway latency # Monitoring Api Gateway latency
resource "datadog_monitor" "API_Gateway_latency" { resource "datadog_monitor" "API_Gateway_latency" {
count = "${var.latency_enabled == "true" ? 1 : 0}" count = var.latency_enabled == "true" ? 1 : 0
name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] API Gateway latency {{#is_alert}}{{{comparator}}} {{threshold}}ms ({{value}}ms){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}ms ({{value}}ms){{/is_warning}}" name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] API Gateway latency {{#is_alert}}{{{comparator}}} {{threshold}}ms ({{value}}ms){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}ms ({{value}}ms){{/is_warning}}"
message = coalesce(var.latency_message, var.message)
type = "query alert" type = "query alert"
message = "${coalesce(var.latency_message, var.message)}"
query = <<EOQ query = <<EOQ
${var.latency_time_aggregator}(${var.latency_timeframe}): ${var.latency_time_aggregator}(${var.latency_timeframe}):
default(avg:aws.apigateway.latency{${var.filter_tags}} by {region,apiname,stage}, 0) default(avg:aws.apigateway.latency{${var.filter_tags}} by {region,apiname,stage}, 0)
> ${var.latency_threshold_critical} > ${var.latency_threshold_critical}
EOQ EOQ
evaluation_delay = "${var.evaluation_delay}" thresholds = {
new_host_delay = "${var.new_host_delay}" warning = var.latency_threshold_warning
critical = var.latency_threshold_critical
thresholds {
warning = "${var.latency_threshold_warning}"
critical = "${var.latency_threshold_critical}"
} }
notify_no_data = false # Will NOT notify when no data is received evaluation_delay = var.evaluation_delay
new_host_delay = var.new_host_delay
notify_no_data = false
renotify_interval = 0 renotify_interval = 0
require_full_window = false require_full_window = false
timeout_h = 0 timeout_h = 0
include_tags = true include_tags = true
tags = ["env:${var.environment}", "type:cloud", "provider:aws", "resource:apigateway", "team:claranet", "created-by:terraform", "${var.latency_extra_tags}"] tags = concat(["env:${var.environment}", "type:cloud", "provider:aws", "resource:apigateway", "team:claranet", "created-by:terraform"], var.latency_extra_tags)
} }
# Monitoring API Gateway 5xx errors percent # Monitoring API Gateway 5xx errors percent
resource "datadog_monitor" "API_http_5xx_errors_count" { resource "datadog_monitor" "API_http_5xx_errors_count" {
count = "${var.http_5xx_requests_enabled == "true" ? 1 : 0}" count = var.http_5xx_requests_enabled == "true" ? 1 : 0
name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] API Gateway HTTP 5xx errors {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] API Gateway HTTP 5xx errors {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
message = coalesce(var.http_5xx_requests_message, var.message)
type = "query alert" type = "query alert"
message = "${coalesce(var.http_5xx_requests_message, var.message)}"
query = <<EOQ query = <<EOQ
${var.http_5xx_requests_time_aggregator}(${var.http_5xx_requests_timeframe}): ${var.http_5xx_requests_time_aggregator}(${var.http_5xx_requests_timeframe}):
default(avg:aws.apigateway.5xxerror{${var.filter_tags}} by {region,apiname,stage}.as_rate(), 0) / ( default(avg:aws.apigateway.5xxerror{${var.filter_tags}} by {region,apiname,stage}.as_rate(), 0) / (
default(avg:aws.apigateway.count{${var.filter_tags}} by {region,apiname,stage}.as_rate() + ${var.artificial_requests_count}, 1)) default(avg:aws.apigateway.count{${var.filter_tags}} by {region,apiname,stage}.as_rate() + ${var.artificial_requests_count}, 1))
* 100 > ${var.http_5xx_requests_threshold_critical} * 100 > ${var.http_5xx_requests_threshold_critical}
EOQ EOQ
evaluation_delay = "${var.evaluation_delay}" thresholds = {
new_host_delay = "${var.new_host_delay}" warning = var.http_5xx_requests_threshold_warning
critical = var.http_5xx_requests_threshold_critical
thresholds {
warning = "${var.http_5xx_requests_threshold_warning}"
critical = "${var.http_5xx_requests_threshold_critical}"
} }
notify_no_data = false # Will NOT notify when no data is received evaluation_delay = var.evaluation_delay
new_host_delay = var.new_host_delay
notify_no_data = false
renotify_interval = 0 renotify_interval = 0
require_full_window = false require_full_window = false
timeout_h = 1 timeout_h = 1
include_tags = true include_tags = true
tags = ["env:${var.environment}", "type:cloud", "provider:aws", "resource:apigateway", "team:claranet", "created-by:terraform", "${var.http_5xx_requests_extra_tags}"] tags = concat(["env:${var.environment}", "type:cloud", "provider:aws", "resource:apigateway", "team:claranet", "created-by:terraform"], var.http_5xx_requests_extra_tags)
} }
# Monitoring API Gateway 4xx errors percent # Monitoring API Gateway 4xx errors percent
resource "datadog_monitor" "API_http_4xx_errors_count" { resource "datadog_monitor" "API_http_4xx_errors_count" {
count = "${var.http_4xx_requests_enabled == "true" ? 1 : 0}" count = var.http_4xx_requests_enabled == "true" ? 1 : 0
name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] API Gateway HTTP 4xx errors {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] API Gateway HTTP 4xx errors {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
message = coalesce(var.http_4xx_requests_message, var.message)
type = "query alert" type = "query alert"
message = "${coalesce(var.http_4xx_requests_message, var.message)}"
query = <<EOQ query = <<EOQ
${var.http_4xx_requests_time_aggregator}(${var.http_4xx_requests_timeframe}): ${var.http_4xx_requests_time_aggregator}(${var.http_4xx_requests_timeframe}):
default(avg:aws.apigateway.4xxerror{${var.filter_tags}} by {region,apiname,stage}.as_rate(), 0) / ( default(avg:aws.apigateway.4xxerror{${var.filter_tags}} by {region,apiname,stage}.as_rate(), 0) / (
default(avg:aws.apigateway.count{${var.filter_tags}} by {region,apiname,stage}.as_rate() + ${var.artificial_requests_count}, 1)) default(avg:aws.apigateway.count{${var.filter_tags}} by {region,apiname,stage}.as_rate() + ${var.artificial_requests_count}, 1))
* 100 > ${var.http_4xx_requests_threshold_critical} * 100 > ${var.http_4xx_requests_threshold_critical}
EOQ EOQ
evaluation_delay = "${var.evaluation_delay}" thresholds = {
new_host_delay = "${var.new_host_delay}" warning = var.http_4xx_requests_threshold_warning
critical = var.http_4xx_requests_threshold_critical
thresholds {
warning = "${var.http_4xx_requests_threshold_warning}"
critical = "${var.http_4xx_requests_threshold_critical}"
} }
notify_no_data = false # Will NOT notify when no data is received evaluation_delay = var.evaluation_delay
new_host_delay = var.new_host_delay
notify_no_data = false
renotify_interval = 0 renotify_interval = 0
require_full_window = false require_full_window = false
timeout_h = 1 timeout_h = 1
include_tags = true include_tags = true
tags = ["env:${var.environment}", "type:cloud", "provider:aws", "resource:apigateway", "team:claranet", "created-by:terraform", "${var.http_4xx_requests_extra_tags}"] tags = concat(["env:${var.environment}", "type:cloud", "provider:aws", "resource:apigateway", "team:claranet", "created-by:terraform"], var.http_4xx_requests_extra_tags)
} }

View File

@ -1,14 +1,15 @@
output "API_Gateway_latency_id" { output "API_Gateway_latency_id" {
description = "id for monitor API_Gateway_latency" description = "id for monitor API_Gateway_latency"
value = "${datadog_monitor.API_Gateway_latency.*.id}" value = datadog_monitor.API_Gateway_latency.*.id
} }
output "API_http_5xx_errors_count_id" { output "API_http_5xx_errors_count_id" {
description = "id for monitor API_http_5xx_errors_count" description = "id for monitor API_http_5xx_errors_count"
value = "${datadog_monitor.API_http_5xx_errors_count.*.id}" value = datadog_monitor.API_http_5xx_errors_count.*.id
} }
output "API_http_4xx_errors_count_id" { output "API_http_4xx_errors_count_id" {
description = "id for monitor API_http_4xx_errors_count" description = "id for monitor API_http_4xx_errors_count"
value = "${datadog_monitor.API_http_4xx_errors_count.*.id}" value = datadog_monitor.API_http_4xx_errors_count.*.id
} }

View File

@ -6,8 +6,8 @@
module "datadog-monitors-cloud-aws-elasticache-common" { module "datadog-monitors-cloud-aws-elasticache-common" {
source = "git::ssh://git@git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors.git//cloud/aws/elasticache/common?ref={revision}" source = "git::ssh://git@git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors.git//cloud/aws/elasticache/common?ref={revision}"
environment = "${var.environment}" environment = var.environment
message = "${module.datadog-message-alerting.alerting-message}" message = module.datadog-message-alerting.alerting-message
} }
``` ```
@ -30,10 +30,10 @@ Creates DataDog monitors with the following checks:
| environment | Infrastructure Environment | string | n/a | yes | | environment | Infrastructure Environment | string | n/a | yes |
| evaluation\_delay | Delay in seconds for the metric evaluation | string | `"900"` | no | | evaluation\_delay | Delay in seconds for the metric evaluation | string | `"900"` | no |
| eviction\_enabled | Flag to enable Elasticache eviction monitor | string | `"true"` | no | | eviction\_enabled | Flag to enable Elasticache eviction monitor | string | `"true"` | no |
| eviction\_extra\_tags | Extra tags for Elasticache eviction monitor | list | `[]` | no | | eviction\_extra\_tags | Extra tags for Elasticache eviction monitor | list(string) | `[]` | no |
| eviction\_growing\_condition\_timeframe | Monitor condition timeframe for Elasticache eviction growing [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no | | eviction\_growing\_condition\_timeframe | Monitor condition timeframe for Elasticache eviction growing [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no |
| eviction\_growing\_enabled | Flag to enable Elasticache eviction growing monitor | string | `"true"` | no | | eviction\_growing\_enabled | Flag to enable Elasticache eviction growing monitor | string | `"true"` | no |
| eviction\_growing\_extra\_tags | Extra tags for Elasticache eviction growing monitor | list | `[]` | no | | eviction\_growing\_extra\_tags | Extra tags for Elasticache eviction growing monitor | list(string) | `[]` | no |
| eviction\_growing\_message | Custom message for Elasticache eviction growing monitor | string | `""` | no | | eviction\_growing\_message | Custom message for Elasticache eviction growing monitor | string | `""` | no |
| eviction\_growing\_threshold\_critical | Elasticache eviction growing critical threshold in percentage | string | `"30"` | no | | eviction\_growing\_threshold\_critical | Elasticache eviction growing critical threshold in percentage | string | `"30"` | no |
| eviction\_growing\_threshold\_warning | Elasticache eviction growing warning threshold in percentage | string | `"10"` | no | | eviction\_growing\_threshold\_warning | Elasticache eviction growing warning threshold in percentage | string | `"10"` | no |
@ -47,26 +47,26 @@ Creates DataDog monitors with the following checks:
| filter\_tags\_use\_defaults | Use default filter tags convention | string | `"true"` | no | | filter\_tags\_use\_defaults | Use default filter tags convention | string | `"true"` | no |
| free\_memory\_condition\_timeframe | Monitor condition timeframe for Elasticache free memory [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_15m"` | no | | free\_memory\_condition\_timeframe | Monitor condition timeframe for Elasticache free memory [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_15m"` | no |
| free\_memory\_enabled | Flag to enable Elasticache free memory monitor | string | `"true"` | no | | free\_memory\_enabled | Flag to enable Elasticache free memory monitor | string | `"true"` | no |
| free\_memory\_extra\_tags | Extra tags for Elasticache free memory monitor | list | `[]` | no | | free\_memory\_extra\_tags | Extra tags for Elasticache free memory monitor | list(string) | `[]` | no |
| free\_memory\_message | Custom message for Elasticache free memory monitor | string | `""` | no | | free\_memory\_message | Custom message for Elasticache free memory monitor | string | `""` | no |
| free\_memory\_threshold\_critical | Elasticache free memory critical threshold in percentage | string | `"-70"` | no | | free\_memory\_threshold\_critical | Elasticache free memory critical threshold in percentage | string | `"-70"` | no |
| free\_memory\_threshold\_warning | Elasticache free memory warning threshold in percentage | string | `"-50"` | no | | free\_memory\_threshold\_warning | Elasticache free memory warning threshold in percentage | string | `"-50"` | no |
| free\_memory\_timeframe | Monitor timeframe for Elasticache free memory [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_15m"` | no | | free\_memory\_timeframe | Monitor timeframe for Elasticache free memory [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_15m"` | no |
| max\_connection\_enabled | Flag to enable Elasticache max connection monitor | string | `"true"` | no | | max\_connection\_enabled | Flag to enable Elasticache max connection monitor | string | `"true"` | no |
| max\_connection\_extra\_tags | Extra tags for Elasticache max connection monitor | list | `[]` | no | | max\_connection\_extra\_tags | Extra tags for Elasticache max connection monitor | list(string) | `[]` | no |
| max\_connection\_message | Custom message for Elasticache max connection monitor | string | `""` | no | | max\_connection\_message | Custom message for Elasticache max connection monitor | string | `""` | no |
| max\_connection\_time\_aggregator | Monitor aggregator for Elasticache max connection [available values: min, max or avg] | string | `"max"` | no | | max\_connection\_time\_aggregator | Monitor aggregator for Elasticache max connection [available values: min, max or avg] | string | `"max"` | no |
| max\_connection\_timeframe | Monitor timeframe for Elasticache max connection [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no | | max\_connection\_timeframe | Monitor timeframe for Elasticache max connection [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no |
| message | Message sent when an alert is triggered | string | n/a | yes | | message | Message sent when an alert is triggered | string | n/a | yes |
| new\_host\_delay | Delay in seconds before monitor new resource | string | `"300"` | no | | new\_host\_delay | Delay in seconds before monitor new resource | string | `"300"` | no |
| no\_connection\_enabled | Flag to enable Elasticache no connection monitor | string | `"true"` | no | | no\_connection\_enabled | Flag to enable Elasticache no connection monitor | string | `"true"` | no |
| no\_connection\_extra\_tags | Extra tags for Elasticache no connection monitor | list | `[]` | no | | no\_connection\_extra\_tags | Extra tags for Elasticache no connection monitor | list(string) | `[]` | no |
| no\_connection\_message | Custom message for Elasticache no connection monitor | string | `""` | no | | no\_connection\_message | Custom message for Elasticache no connection monitor | string | `""` | no |
| no\_connection\_time\_aggregator | Monitor aggregator for Elasticache no connection [available values: min, max or avg] | string | `"min"` | no | | no\_connection\_time\_aggregator | Monitor aggregator for Elasticache no connection [available values: min, max or avg] | string | `"min"` | no |
| no\_connection\_timeframe | Monitor timeframe for Elasticache no connection [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no | | no\_connection\_timeframe | Monitor timeframe for Elasticache no connection [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no |
| prefix\_slug | Prefix string to prepend between brackets on every monitors names | string | `""` | no | | prefix\_slug | Prefix string to prepend between brackets on every monitors names | string | `""` | no |
| swap\_enabled | Flag to enable Elasticache swap monitor | string | `"true"` | no | | swap\_enabled | Flag to enable Elasticache swap monitor | string | `"true"` | no |
| swap\_extra\_tags | Extra tags for Elasticache swap monitor | list | `[]` | no | | swap\_extra\_tags | Extra tags for Elasticache swap monitor | list(string) | `[]` | no |
| swap\_message | Custom message for Elasticache swap monitor | string | `""` | no | | swap\_message | Custom message for Elasticache swap monitor | string | `""` | no |
| swap\_threshold\_critical | Elasticache swap critical threshold in bytes | string | `"50000000"` | no | | swap\_threshold\_critical | Elasticache swap critical threshold in bytes | string | `"50000000"` | no |
| swap\_threshold\_warning | Elasticache swap warning threshold in bytes | string | `"0"` | no | | swap\_threshold\_warning | Elasticache swap warning threshold in bytes | string | `"0"` | no |

View File

@ -1,7 +1,7 @@
# Global Terraform # Global Terraform
variable "environment" { variable "environment" {
description = "Infrastructure Environment" description = "Infrastructure Environment"
type = "string" type = string
} }
# Global DataDog # Global DataDog
@ -43,222 +43,223 @@ variable "filter_tags_custom_excluded" {
variable "eviction_enabled" { variable "eviction_enabled" {
description = "Flag to enable Elasticache eviction monitor" description = "Flag to enable Elasticache eviction monitor"
type = "string" type = string
default = "true" default = "true"
} }
variable "eviction_extra_tags" { variable "eviction_extra_tags" {
description = "Extra tags for Elasticache eviction monitor" description = "Extra tags for Elasticache eviction monitor"
type = "list" type = list(string)
default = [] default = []
} }
variable "eviction_message" { variable "eviction_message" {
description = "Custom message for Elasticache eviction monitor" description = "Custom message for Elasticache eviction monitor"
type = "string" type = string
default = "" default = ""
} }
variable "eviction_timeframe" { variable "eviction_timeframe" {
description = "Monitor timeframe for Elasticache eviction [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" description = "Monitor timeframe for Elasticache eviction [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string" type = string
default = "last_15m" default = "last_15m"
} }
variable "eviction_threshold_warning" { variable "eviction_threshold_warning" {
description = "Elasticache eviction warning threshold (number of evictions summed over the timeframe)" description = "Elasticache eviction warning threshold (number of evictions summed over the timeframe)"
type = "string" type = string
default = 0 default = 0
} }
variable "eviction_threshold_critical" { variable "eviction_threshold_critical" {
description = "Elasticache eviction critical threshold (number of evictions summed over the timeframe)" description = "Elasticache eviction critical threshold (number of evictions summed over the timeframe)"
type = "string" type = string
default = 30 default = 30
} }
variable "max_connection_enabled" { variable "max_connection_enabled" {
description = "Flag to enable Elasticache max connection monitor" description = "Flag to enable Elasticache max connection monitor"
type = "string" type = string
default = "true" default = "true"
} }
variable "max_connection_extra_tags" { variable "max_connection_extra_tags" {
description = "Extra tags for Elasticache max connection monitor" description = "Extra tags for Elasticache max connection monitor"
type = "list" type = list(string)
default = [] default = []
} }
variable "max_connection_message" { variable "max_connection_message" {
description = "Custom message for Elasticache max connection monitor" description = "Custom message for Elasticache max connection monitor"
type = "string" type = string
default = "" default = ""
} }
variable "max_connection_time_aggregator" { variable "max_connection_time_aggregator" {
description = "Monitor aggregator for Elasticache max connection [available values: min, max or avg]" description = "Monitor aggregator for Elasticache max connection [available values: min, max or avg]"
type = "string" type = string
default = "max" default = "max"
} }
variable "max_connection_timeframe" { variable "max_connection_timeframe" {
description = "Monitor timeframe for Elasticache max connection [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" description = "Monitor timeframe for Elasticache max connection [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string" type = string
default = "last_5m" default = "last_5m"
} }
variable "no_connection_enabled" { variable "no_connection_enabled" {
description = "Flag to enable Elasticache no connection monitor" description = "Flag to enable Elasticache no connection monitor"
type = "string" type = string
default = "true" default = "true"
} }
variable "no_connection_extra_tags" { variable "no_connection_extra_tags" {
description = "Extra tags for Elasticache no connection monitor" description = "Extra tags for Elasticache no connection monitor"
type = "list" type = list(string)
default = [] default = []
} }
variable "no_connection_message" { variable "no_connection_message" {
description = "Custom message for Elasticache no connection monitor" description = "Custom message for Elasticache no connection monitor"
type = "string" type = string
default = "" default = ""
} }
variable "no_connection_time_aggregator" { variable "no_connection_time_aggregator" {
description = "Monitor aggregator for Elasticache no connection [available values: min, max or avg]" description = "Monitor aggregator for Elasticache no connection [available values: min, max or avg]"
type = "string" type = string
default = "min" default = "min"
} }
variable "no_connection_timeframe" { variable "no_connection_timeframe" {
description = "Monitor timeframe for Elasticache no connection [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" description = "Monitor timeframe for Elasticache no connection [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string" type = string
default = "last_5m" default = "last_5m"
} }
variable "swap_enabled" { variable "swap_enabled" {
description = "Flag to enable Elasticache swap monitor" description = "Flag to enable Elasticache swap monitor"
type = "string" type = string
default = "true" default = "true"
} }
variable "swap_extra_tags" { variable "swap_extra_tags" {
description = "Extra tags for Elasticache swap monitor" description = "Extra tags for Elasticache swap monitor"
type = "list" type = list(string)
default = [] default = []
} }
variable "swap_message" { variable "swap_message" {
description = "Custom message for Elasticache swap monitor" description = "Custom message for Elasticache swap monitor"
type = "string" type = string
default = "" default = ""
} }
variable "swap_time_aggregator" { variable "swap_time_aggregator" {
description = "Monitor aggregator for Elasticache swap [available values: min, max or avg]" description = "Monitor aggregator for Elasticache swap [available values: min, max or avg]"
type = "string" type = string
default = "min" default = "min"
} }
variable "swap_timeframe" { variable "swap_timeframe" {
description = "Monitor timeframe for Elasticache swap [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" description = "Monitor timeframe for Elasticache swap [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string" type = string
default = "last_5m" default = "last_5m"
} }
variable "swap_threshold_warning" { variable "swap_threshold_warning" {
description = "Elasticache swap warning threshold in bytes" description = "Elasticache swap warning threshold in bytes"
type = "string" type = string
default = 0 default = 0
} }
variable "swap_threshold_critical" { variable "swap_threshold_critical" {
description = "Elasticache swap critical threshold in bytes" description = "Elasticache swap critical threshold in bytes"
type = "string" type = string
default = 50000000 default = 50000000
} }
variable "free_memory_enabled" { variable "free_memory_enabled" {
description = "Flag to enable Elasticache free memory monitor" description = "Flag to enable Elasticache free memory monitor"
type = "string" type = string
default = "true" default = "true"
} }
variable "free_memory_extra_tags" { variable "free_memory_extra_tags" {
description = "Extra tags for Elasticache free memory monitor" description = "Extra tags for Elasticache free memory monitor"
type = "list" type = list(string)
default = [] default = []
} }
variable "free_memory_message" { variable "free_memory_message" {
description = "Custom message for Elasticache free memory monitor" description = "Custom message for Elasticache free memory monitor"
type = "string" type = string
default = "" default = ""
} }
variable "free_memory_condition_timeframe" { variable "free_memory_condition_timeframe" {
description = "Monitor condition timeframe for Elasticache free memory [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" description = "Monitor condition timeframe for Elasticache free memory [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string" type = string
default = "last_15m" default = "last_15m"
} }
variable "free_memory_timeframe" { variable "free_memory_timeframe" {
description = "Monitor timeframe for Elasticache free memory [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" description = "Monitor timeframe for Elasticache free memory [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string" type = string
default = "last_15m" default = "last_15m"
} }
variable "free_memory_threshold_warning" { variable "free_memory_threshold_warning" {
description = "Elasticache free memory warning threshold in percentage" description = "Elasticache free memory warning threshold in percentage"
type = "string" type = string
default = -50 default = -50
} }
variable "free_memory_threshold_critical" { variable "free_memory_threshold_critical" {
description = "Elasticache free memory critical threshold in percentage" description = "Elasticache free memory critical threshold in percentage"
type = "string" type = string
default = -70 default = -70
} }
variable "eviction_growing_enabled" { variable "eviction_growing_enabled" {
description = "Flag to enable Elasticache eviction growing monitor" description = "Flag to enable Elasticache eviction growing monitor"
type = "string" type = string
default = "true" default = "true"
} }
variable "eviction_growing_extra_tags" { variable "eviction_growing_extra_tags" {
description = "Extra tags for Elasticache eviction growing monitor" description = "Extra tags for Elasticache eviction growing monitor"
type = "list" type = list(string)
default = [] default = []
} }
variable "eviction_growing_message" { variable "eviction_growing_message" {
description = "Custom message for Elasticache eviction growing monitor" description = "Custom message for Elasticache eviction growing monitor"
type = "string" type = string
default = "" default = ""
} }
variable "eviction_growing_condition_timeframe" { variable "eviction_growing_condition_timeframe" {
description = "Monitor condition timeframe for Elasticache eviction growing [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" description = "Monitor condition timeframe for Elasticache eviction growing [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string" type = string
default = "last_5m" default = "last_5m"
} }
variable "eviction_growing_timeframe" { variable "eviction_growing_timeframe" {
description = "Monitor timeframe for Elasticache eviction growing [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" description = "Monitor timeframe for Elasticache eviction growing [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string" type = string
default = "last_5m" default = "last_5m"
} }
variable "eviction_growing_threshold_warning" { variable "eviction_growing_threshold_warning" {
description = "Elasticache eviction growing warning threshold in percentage" description = "Elasticache eviction growing warning threshold in percentage"
type = "string" type = string
default = 10 default = 10
} }
variable "eviction_growing_threshold_critical" { variable "eviction_growing_threshold_critical" {
description = "Elasticache eviction growing critical threshold in percentage" description = "Elasticache eviction growing critical threshold in percentage"
type = "string" type = string
default = 30 default = 30
} }

View File

@ -1,9 +1,10 @@
module "filter-tags" { module "filter-tags" {
source = "../../../../common/filter-tags" source = "../../../../common/filter-tags"
environment = "${var.environment}" environment = var.environment
resource = "aws_elasticache" resource = "aws_elasticache"
filter_tags_use_defaults = "${var.filter_tags_use_defaults}" filter_tags_use_defaults = var.filter_tags_use_defaults
filter_tags_custom = "${var.filter_tags_custom}" filter_tags_custom = var.filter_tags_custom
filter_tags_custom_excluded = "${var.filter_tags_custom_excluded}" filter_tags_custom_excluded = var.filter_tags_custom_excluded
} }

View File

@ -1,175 +1,170 @@
resource "datadog_monitor" "elasticache_eviction" { resource "datadog_monitor" "elasticache_eviction" {
count = "${var.eviction_enabled == "true" ? 1 : 0}" count = var.eviction_enabled == "true" ? 1 : 0
name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticache eviction {{#is_alert}}{{{comparator}}} {{threshold}} ({{value}}){{/is_alert}}" name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticache eviction {{#is_alert}}{{{comparator}}} {{threshold}} ({{value}}){{/is_alert}}"
message = "${coalesce(var.eviction_message, var.message)}" message = coalesce(var.eviction_message, var.message)
type = "query alert"
type = "query alert"
query = <<EOQ query = <<EOQ
sum(${var.eviction_timeframe}): ( sum(${var.eviction_timeframe}): (
avg:aws.elasticache.evictions${module.filter-tags.query_alert} by {region,cacheclusterid,cachenodeid} avg:aws.elasticache.evictions${module.filter-tags.query_alert} by {region,cacheclusterid,cachenodeid}
) > ${var.eviction_threshold_critical} ) > ${var.eviction_threshold_critical}
EOQ EOQ
thresholds { thresholds = {
warning = "${var.eviction_threshold_warning}" warning = var.eviction_threshold_warning
critical = "${var.eviction_threshold_critical}" critical = var.eviction_threshold_critical
} }
evaluation_delay = var.evaluation_delay
new_host_delay = var.new_host_delay
notify_no_data = false notify_no_data = false
evaluation_delay = "${var.evaluation_delay}"
renotify_interval = 0 renotify_interval = 0
notify_audit = false notify_audit = false
timeout_h = 0 timeout_h = 0
include_tags = true include_tags = true
locked = false locked = false
require_full_window = false require_full_window = false
new_host_delay = "${var.new_host_delay}"
tags = ["env:${var.environment}", "type:cloud", "provider:aws", "resource:elasticache", "team:claranet", "created-by:terraform", "${var.eviction_extra_tags}"] tags = concat(["env:${var.environment}", "type:cloud", "provider:aws", "resource:elasticache", "team:claranet", "created-by:terraform"], var.eviction_extra_tags)
} }
resource "datadog_monitor" "elasticache_max_connection" { resource "datadog_monitor" "elasticache_max_connection" {
count = "${var.max_connection_enabled == "true" ? 1 : 0}" count = var.max_connection_enabled == "true" ? 1 : 0
name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticache max connections reached {{#is_alert}}{{{comparator}}} {{threshold}} {{/is_alert}}" name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticache max connections reached {{#is_alert}}{{{comparator}}} {{threshold}} {{/is_alert}}"
message = "${coalesce(var.max_connection_message, var.message)}" message = coalesce(var.max_connection_message, var.message)
type = "query alert"
type = "query alert"
query = <<EOQ query = <<EOQ
${var.max_connection_time_aggregator}(${var.max_connection_timeframe}): ( ${var.max_connection_time_aggregator}(${var.max_connection_timeframe}): (
avg:aws.elasticache.curr_connections${module.filter-tags.query_alert} by {region,cacheclusterid,cachenodeid} avg:aws.elasticache.curr_connections${module.filter-tags.query_alert} by {region,cacheclusterid,cachenodeid}
) >= 65000 ) >= 65000
EOQ EOQ
evaluation_delay = var.evaluation_delay
new_host_delay = var.new_host_delay
notify_no_data = true notify_no_data = true
evaluation_delay = "${var.evaluation_delay}"
renotify_interval = 0 renotify_interval = 0
notify_audit = false notify_audit = false
timeout_h = 0 timeout_h = 0
include_tags = true include_tags = true
locked = false locked = false
require_full_window = false require_full_window = false
new_host_delay = "${var.new_host_delay}"
tags = ["env:${var.environment}", "type:cloud", "provider:aws", "resource:elasticache", "team:claranet", "created-by:terraform", "${var.max_connection_extra_tags}"] tags = concat(["env:${var.environment}", "type:cloud", "provider:aws", "resource:elasticache", "team:claranet", "created-by:terraform"], var.max_connection_extra_tags)
} }
resource "datadog_monitor" "elasticache_no_connection" { resource "datadog_monitor" "elasticache_no_connection" {
count = "${var.no_connection_enabled == "true" ? 1 : 0}" count = var.no_connection_enabled == "true" ? 1 : 0
name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticache connections {{#is_alert}}{{{comparator}}} {{threshold}} {{/is_alert}}" name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticache connections {{#is_alert}}{{{comparator}}} {{threshold}} {{/is_alert}}"
message = "${coalesce(var.no_connection_message, var.message)}" message = coalesce(var.no_connection_message, var.message)
type = "query alert"
type = "query alert" query = <<EOQ
query = <<EOQ
${var.no_connection_time_aggregator}(${var.no_connection_timeframe}): ( ${var.no_connection_time_aggregator}(${var.no_connection_timeframe}): (
avg:aws.elasticache.curr_connections${module.filter-tags.query_alert} by {region,cacheclusterid,cachenodeid} avg:aws.elasticache.curr_connections${module.filter-tags.query_alert} by {region,cacheclusterid,cachenodeid}
) <= 0 ) <= 0
EOQ EOQ
evaluation_delay = var.evaluation_delay
new_host_delay = var.new_host_delay
notify_no_data = false notify_no_data = false
evaluation_delay = "${var.evaluation_delay}"
renotify_interval = 0 renotify_interval = 0
notify_audit = false notify_audit = false
timeout_h = 0 timeout_h = 0
include_tags = true include_tags = true
locked = false locked = false
require_full_window = false require_full_window = false
new_host_delay = "${var.new_host_delay}"
tags = ["env:${var.environment}", "type:cloud", "provider:aws", "resource:elasticache", "team:claranet", "created-by:terraform", "${var.no_connection_extra_tags}"] tags = concat(["env:${var.environment}", "type:cloud", "provider:aws", "resource:elasticache", "team:claranet", "created-by:terraform"], var.no_connection_extra_tags)
} }
resource "datadog_monitor" "elasticache_swap" { resource "datadog_monitor" "elasticache_swap" {
count = "${var.swap_enabled == "true" ? 1 : 0}" count = var.swap_enabled == "true" ? 1 : 0
name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticache swap {{#is_alert}}{{{comparator}}} {{threshold}}MB ({{value}}MB){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}MB ({{value}}MB){{/is_warning}}" name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticache swap {{#is_alert}}{{{comparator}}} {{threshold}}MB ({{value}}MB){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}MB ({{value}}MB){{/is_warning}}"
message = "${coalesce(var.swap_message, var.message)}" message = coalesce(var.swap_message, var.message)
type = "query alert"
type = "query alert" query = <<EOQ
query = <<EOQ
${var.swap_time_aggregator}(${var.swap_timeframe}): ( ${var.swap_time_aggregator}(${var.swap_timeframe}): (
avg:aws.elasticache.swap_usage${module.filter-tags.query_alert} by {region,cacheclusterid,cachenodeid} avg:aws.elasticache.swap_usage${module.filter-tags.query_alert} by {region,cacheclusterid,cachenodeid}
) > ${var.swap_threshold_critical} ) > ${var.swap_threshold_critical}
EOQ EOQ
thresholds { thresholds = {
warning = "${var.swap_threshold_warning}" warning = var.swap_threshold_warning
critical = "${var.swap_threshold_critical}" critical = var.swap_threshold_critical
} }
evaluation_delay = var.evaluation_delay
new_host_delay = var.new_host_delay
notify_no_data = false notify_no_data = false
evaluation_delay = "${var.evaluation_delay}"
renotify_interval = 0 renotify_interval = 0
notify_audit = false notify_audit = false
timeout_h = 0 timeout_h = 0
include_tags = true include_tags = true
locked = false locked = false
require_full_window = false require_full_window = false
new_host_delay = "${var.new_host_delay}"
tags = ["env:${var.environment}", "type:cloud", "provider:aws", "resource:elasticache", "team:claranet", "created-by:terraform", "${var.swap_extra_tags}"] tags = concat(["env:${var.environment}", "type:cloud", "provider:aws", "resource:elasticache", "team:claranet", "created-by:terraform"], var.swap_extra_tags)
} }
resource "datadog_monitor" "elasticache_free_memory" { resource "datadog_monitor" "elasticache_free_memory" {
count = "${var.free_memory_enabled == "true" ? 1 : 0}" count = var.free_memory_enabled == "true" ? 1 : 0
name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticache free memory {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticache free memory {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
message = "${coalesce(var.free_memory_message, var.message)}" message = coalesce(var.free_memory_message, var.message)
type = "query alert"
type = "query alert"
query = <<EOQ query = <<EOQ
pct_change(avg(${var.free_memory_timeframe}),${var.free_memory_condition_timeframe}): pct_change(avg(${var.free_memory_timeframe}),${var.free_memory_condition_timeframe}):
avg:aws.elasticache.freeable_memory${module.filter-tags.query_alert} by {region,cacheclusterid,cachenodeid} avg:aws.elasticache.freeable_memory${module.filter-tags.query_alert} by {region,cacheclusterid,cachenodeid}
< ${var.free_memory_threshold_critical} < ${var.free_memory_threshold_critical}
EOQ EOQ
thresholds { thresholds = {
warning = "${var.free_memory_threshold_warning}" warning = var.free_memory_threshold_warning
critical = "${var.free_memory_threshold_critical}" critical = var.free_memory_threshold_critical
} }
evaluation_delay = var.evaluation_delay
new_host_delay = var.new_host_delay
notify_no_data = false notify_no_data = false
evaluation_delay = "${var.evaluation_delay}"
renotify_interval = 0 renotify_interval = 0
notify_audit = false notify_audit = false
timeout_h = 0 timeout_h = 0
include_tags = true include_tags = true
locked = false locked = false
require_full_window = false require_full_window = false
new_host_delay = "${var.new_host_delay}"
tags = ["env:${var.environment}", "type:cloud", "provider:aws", "resource:elasticache", "team:claranet", "created-by:terraform", "${var.free_memory_extra_tags}"] tags = concat(["env:${var.environment}", "type:cloud", "provider:aws", "resource:elasticache", "team:claranet", "created-by:terraform"], var.free_memory_extra_tags)
} }
resource "datadog_monitor" "elasticache_eviction_growing" { resource "datadog_monitor" "elasticache_eviction_growing" {
count = "${var.eviction_growing_enabled == "true" ? 1 : 0}" count = var.eviction_growing_enabled == "true" ? 1 : 0
name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticache evictions is growing {{#is_alert}}{{{comparator}}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ({{value}}){{/is_warning}}" name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticache evictions is growing {{#is_alert}}{{{comparator}}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ({{value}}){{/is_warning}}"
message = "${coalesce(var.eviction_growing_message, var.message)}" message = coalesce(var.eviction_growing_message, var.message)
type = "query alert"
type = "query alert"
query = <<EOQ query = <<EOQ
pct_change(avg(${var.eviction_growing_timeframe}),${var.eviction_growing_condition_timeframe}): pct_change(avg(${var.eviction_growing_timeframe}),${var.eviction_growing_condition_timeframe}):
avg:aws.elasticache.evictions${module.filter-tags.query_alert} by {region,cacheclusterid,cachenodeid} avg:aws.elasticache.evictions${module.filter-tags.query_alert} by {region,cacheclusterid,cachenodeid}
> ${var.eviction_growing_threshold_critical} > ${var.eviction_growing_threshold_critical}
EOQ EOQ
thresholds { thresholds = {
warning = "${var.eviction_growing_threshold_warning}" warning = var.eviction_growing_threshold_warning
critical = "${var.eviction_growing_threshold_critical}" critical = var.eviction_growing_threshold_critical
} }
evaluation_delay = var.evaluation_delay
new_host_delay = var.new_host_delay
notify_no_data = false notify_no_data = false
evaluation_delay = "${var.evaluation_delay}"
renotify_interval = 0 renotify_interval = 0
notify_audit = false notify_audit = false
timeout_h = 0 timeout_h = 0
include_tags = true include_tags = true
locked = false locked = false
require_full_window = false require_full_window = false
new_host_delay = "${var.new_host_delay}"
tags = ["env:${var.environment}", "type:cloud", "provider:aws", "resource:elasticache", "team:claranet", "created-by:terraform", "${var.eviction_growing_extra_tags}"] tags = concat(["env:${var.environment}", "type:cloud", "provider:aws", "resource:elasticache", "team:claranet", "created-by:terraform"], var.eviction_growing_extra_tags)
} }

View File

@ -1,29 +1,30 @@
output "elasticache_eviction_id" { output "elasticache_eviction_id" {
description = "id for monitor elasticache_eviction" description = "id for monitor elasticache_eviction"
value = "${datadog_monitor.elasticache_eviction.*.id}" value = datadog_monitor.elasticache_eviction.*.id
} }
output "elasticache_max_connection_id" { output "elasticache_max_connection_id" {
description = "id for monitor elasticache_max_connection" description = "id for monitor elasticache_max_connection"
value = "${datadog_monitor.elasticache_max_connection.*.id}" value = datadog_monitor.elasticache_max_connection.*.id
} }
output "elasticache_no_connection_id" { output "elasticache_no_connection_id" {
description = "id for monitor elasticache_no_connection" description = "id for monitor elasticache_no_connection"
value = "${datadog_monitor.elasticache_no_connection.*.id}" value = datadog_monitor.elasticache_no_connection.*.id
} }
output "elasticache_swap_id" { output "elasticache_swap_id" {
description = "id for monitor elasticache_swap" description = "id for monitor elasticache_swap"
value = "${datadog_monitor.elasticache_swap.*.id}" value = datadog_monitor.elasticache_swap.*.id
} }
output "elasticache_free_memory_id" { output "elasticache_free_memory_id" {
description = "id for monitor elasticache_free_memory" description = "id for monitor elasticache_free_memory"
value = "${datadog_monitor.elasticache_free_memory.*.id}" value = datadog_monitor.elasticache_free_memory.*.id
} }
output "elasticache_eviction_growing_id" { output "elasticache_eviction_growing_id" {
description = "id for monitor elasticache_eviction_growing" description = "id for monitor elasticache_eviction_growing"
value = "${datadog_monitor.elasticache_eviction_growing.*.id}" value = datadog_monitor.elasticache_eviction_growing.*.id
} }

View File

@ -6,8 +6,8 @@
module "datadog-monitors-cloud-aws-elasticache-memcached" { module "datadog-monitors-cloud-aws-elasticache-memcached" {
source = "git::ssh://git@git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors.git//cloud/aws/elasticache/memcached?ref={revision}" source = "git::ssh://git@git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors.git//cloud/aws/elasticache/memcached?ref={revision}"
environment = "${var.environment}" environment = var.environment
message = "${module.datadog-message-alerting.alerting-message}" message = module.datadog-message-alerting.alerting-message
} }
``` ```
@ -24,7 +24,7 @@ Creates DataDog monitors with the following checks:
| Name | Description | Type | Default | Required | | Name | Description | Type | Default | Required |
|------|-------------|:----:|:-----:|:-----:| |------|-------------|:----:|:-----:|:-----:|
| cpu\_high\_enabled | Flag to enable Elasticache memcached cpu high monitor | string | `"true"` | no | | cpu\_high\_enabled | Flag to enable Elasticache memcached cpu high monitor | string | `"true"` | no |
| cpu\_high\_extra\_tags | Extra tags for Elasticache memcached cpu high monitor | list | `[]` | no | | cpu\_high\_extra\_tags | Extra tags for Elasticache memcached cpu high monitor | list(string) | `[]` | no |
| cpu\_high\_message | Custom message for Elasticache memcached cpu high monitor | string | `""` | no | | cpu\_high\_message | Custom message for Elasticache memcached cpu high monitor | string | `""` | no |
| cpu\_high\_threshold\_critical | Elasticache memcached cpu high critical threshold in percentage | string | `"90"` | no | | cpu\_high\_threshold\_critical | Elasticache memcached cpu high critical threshold in percentage | string | `"90"` | no |
| cpu\_high\_threshold\_warning | Elasticache memcached cpu high warning threshold in percentage | string | `"75"` | no | | cpu\_high\_threshold\_warning | Elasticache memcached cpu high warning threshold in percentage | string | `"75"` | no |
@ -36,7 +36,7 @@ Creates DataDog monitors with the following checks:
| filter\_tags\_custom\_excluded | Tags excluded for custom filtering when filter_tags_use_defaults is false | string | `""` | no | | filter\_tags\_custom\_excluded | Tags excluded for custom filtering when filter_tags_use_defaults is false | string | `""` | no |
| filter\_tags\_use\_defaults | Use default filter tags convention | string | `"true"` | no | | filter\_tags\_use\_defaults | Use default filter tags convention | string | `"true"` | no |
| get\_hits\_enabled | Flag to enable Elasticache memcached get hits monitor | string | `"true"` | no | | get\_hits\_enabled | Flag to enable Elasticache memcached get hits monitor | string | `"true"` | no |
| get\_hits\_extra\_tags | Extra tags for Elasticache memcached get hits monitor | list | `[]` | no | | get\_hits\_extra\_tags | Extra tags for Elasticache memcached get hits monitor | list(string) | `[]` | no |
| get\_hits\_message | Custom message for Elasticache memcached get hits monitor | string | `""` | no | | get\_hits\_message | Custom message for Elasticache memcached get hits monitor | string | `""` | no |
| get\_hits\_threshold\_critical | Elasticache memcached get hits critical threshold in percentage | string | `"60"` | no | | get\_hits\_threshold\_critical | Elasticache memcached get hits critical threshold in percentage | string | `"60"` | no |
| get\_hits\_threshold\_warning | Elasticache memcached get hits warning threshold in percentage | string | `"80"` | no | | get\_hits\_threshold\_warning | Elasticache memcached get hits warning threshold in percentage | string | `"80"` | no |

View File

@ -1,7 +1,7 @@
# Global Terraform # Global Terraform
variable "environment" { variable "environment" {
description = "Infrastructure Environment" description = "Infrastructure Environment"
type = "string" type = string
} }
# Global DataDog # Global DataDog
@ -43,84 +43,85 @@ variable "filter_tags_custom_excluded" {
# ---------------------------------------------------------------------------
# Elasticache memcached "get hits" (cache hit ratio) monitor settings
# ---------------------------------------------------------------------------

# String toggle: the monitor resource is only created when this equals "true".
variable "get_hits_enabled" {
  type        = string
  default     = "true"
  description = "Flag to enable Elasticache memcached get hits monitor"
}

# Appended to the fixed tag list of the monitor.
variable "get_hits_extra_tags" {
  type        = list(string)
  default     = []
  description = "Extra tags for Elasticache memcached get hits monitor"
}

# Left empty, the monitor falls back to the module-wide `message` variable.
variable "get_hits_message" {
  type        = string
  default     = ""
  description = "Custom message for Elasticache memcached get hits monitor"
}

variable "get_hits_time_aggregator" {
  type        = string
  default     = "max"
  description = "Monitor aggregator for Elasticache memcached get hits [available values: min, max or avg]"
}

variable "get_hits_timeframe" {
  type        = string
  default     = "last_15m"
  description = "Monitor timeframe for Elasticache memcached get hits [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
}

# The monitor query fires when the ratio drops BELOW the critical value, so the
# warning level is the higher of the two numbers. The numeric defaults are
# converted by Terraform to the declared string type.
variable "get_hits_threshold_warning" {
  type        = string
  default     = 80
  description = "Elasticache memcached get hits warning threshold in percentage"
}

variable "get_hits_threshold_critical" {
  type        = string
  default     = 60
  description = "Elasticache memcached get hits critical threshold in percentage"
}
# ---------------------------------------------------------------------------
# Elasticache memcached "CPU high" monitor settings
# ---------------------------------------------------------------------------

# String toggle: the monitor resource is only created when this equals "true".
variable "cpu_high_enabled" {
  type        = string
  default     = "true"
  description = "Flag to enable Elasticache memcached cpu high monitor"
}

# Appended to the fixed tag list of the monitor.
variable "cpu_high_extra_tags" {
  type        = list(string)
  default     = []
  description = "Extra tags for Elasticache memcached cpu high monitor"
}

# Left empty, the monitor falls back to the module-wide `message` variable.
variable "cpu_high_message" {
  type        = string
  default     = ""
  description = "Custom message for Elasticache memcached cpu high monitor"
}

variable "cpu_high_time_aggregator" {
  type        = string
  default     = "min"
  description = "Monitor aggregator for Elasticache memcached cpu high [available values: min, max or avg]"
}

variable "cpu_high_timeframe" {
  type        = string
  default     = "last_15m"
  description = "Monitor timeframe for Elasticache memcached cpu high [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
}

# The monitor query fires when utilisation rises ABOVE the critical value, so
# the warning level is the lower of the two numbers. The numeric defaults are
# converted by Terraform to the declared string type.
variable "cpu_high_threshold_warning" {
  type        = string
  default     = 75
  description = "Elasticache memcached cpu high warning threshold in percentage"
}

variable "cpu_high_threshold_critical" {
  type        = string
  default     = 90
  description = "Elasticache memcached cpu high critical threshold in percentage"
}

View File

@ -1,9 +1,10 @@
# Shared helper module; the monitors in this directory read its `query_alert`
# output to scope their queries.
module "filter-tags" {
  source = "../../../../common/filter-tags"

  # Fixed for this module: every monitor here targets Elasticache.
  resource = "aws_elasticache"

  # Caller-provided settings, forwarded unchanged.
  environment                 = var.environment
  filter_tags_custom          = var.filter_tags_custom
  filter_tags_custom_excluded = var.filter_tags_custom_excluded
  filter_tags_use_defaults    = var.filter_tags_use_defaults
}

View File

@ -1,9 +1,8 @@
resource "datadog_monitor" "memcached_get_hits" { resource "datadog_monitor" "memcached_get_hits" {
count = "${var.get_hits_enabled == "true" ? 1 : 0}" count = var.get_hits_enabled == "true" ? 1 : 0
name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticache memcached cache hit ratio {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticache memcached cache hit ratio {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
message = "${coalesce(var.get_hits_message, var.message)}" message = coalesce(var.get_hits_message, var.message)
type = "query alert"
type = "query alert"
query = <<EOQ query = <<EOQ
${var.get_hits_time_aggregator}(${var.get_hits_timeframe}): ( ${var.get_hits_time_aggregator}(${var.get_hits_timeframe}): (
@ -11,53 +10,53 @@ resource "datadog_monitor" "memcached_get_hits" {
default(avg:aws.elasticache.get_hits${module.filter-tags.query_alert} by {region,cacheclusterid,cachenodeid}.as_rate(), 0) + default(avg:aws.elasticache.get_hits${module.filter-tags.query_alert} by {region,cacheclusterid,cachenodeid}.as_rate(), 0) +
default(avg:aws.elasticache.get_misses${module.filter-tags.query_alert} by {region,cacheclusterid,cachenodeid}.as_rate(), 0)) default(avg:aws.elasticache.get_misses${module.filter-tags.query_alert} by {region,cacheclusterid,cachenodeid}.as_rate(), 0))
) * 100 < ${var.get_hits_threshold_critical} ) * 100 < ${var.get_hits_threshold_critical}
EOQ EOQ
thresholds { thresholds = {
warning = "${var.get_hits_threshold_warning}" warning = var.get_hits_threshold_warning
critical = "${var.get_hits_threshold_critical}" critical = var.get_hits_threshold_critical
} }
evaluation_delay = var.evaluation_delay
new_host_delay = var.new_host_delay
notify_no_data = false notify_no_data = false
evaluation_delay = "${var.evaluation_delay}"
renotify_interval = 0 renotify_interval = 0
notify_audit = false notify_audit = false
timeout_h = 0 timeout_h = 0
include_tags = true include_tags = true
locked = false locked = false
require_full_window = false require_full_window = false
new_host_delay = "${var.new_host_delay}"
tags = ["env:${var.environment}", "type:cloud", "provider:aws", "resource:elasticache-memcached", "team:claranet", "created-by:terraform", "engine:memcached", "${var.get_hits_extra_tags}"] tags = concat(["env:${var.environment}", "type:cloud", "provider:aws", "resource:elasticache-memcached", "team:claranet", "created-by:terraform", "engine:memcached"], var.get_hits_extra_tags)
} }
# Alerts when Elasticache memcached CPU utilisation, aggregated per
# region / cache cluster / cache node, exceeds the configured thresholds.
resource "datadog_monitor" "memcached_cpu_high" {
  # Created only when the string flag is exactly "true".
  count = var.cpu_high_enabled == "true" ? 1 : 0

  type = "query alert"
  name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticache memcached CPU {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"

  # Monitor-specific message wins; an empty one falls back to the shared message.
  message = coalesce(var.cpu_high_message, var.message)

  query = <<EOQ
    ${var.cpu_high_time_aggregator}(${var.cpu_high_timeframe}): (
      avg:aws.elasticache.cpuutilization${module.filter-tags.query_alert} by {region,cacheclusterid,cachenodeid}
    ) > ${var.cpu_high_threshold_critical}
EOQ

  # The critical value must match the number embedded in the query above.
  thresholds = {
    critical = var.cpu_high_threshold_critical
    warning  = var.cpu_high_threshold_warning
  }

  # Timing options shared by all monitors of the module.
  new_host_delay   = var.new_host_delay
  evaluation_delay = var.evaluation_delay

  # Static monitor options.
  require_full_window = false
  locked              = false
  include_tags        = true
  timeout_h           = 0
  notify_audit        = false
  renotify_interval   = 0
  notify_no_data      = true

  tags = concat(
    [
      "env:${var.environment}",
      "type:cloud",
      "provider:aws",
      "resource:elasticache-memcached",
      "team:claranet",
      "created-by:terraform",
      "engine:memcached",
    ],
    var.cpu_high_extra_tags
  )
}

View File

@ -1,9 +1,10 @@
# Both monitors are declared with `count`, so each output is a list of ids:
# one element when the monitor is enabled, empty when it is disabled.
output "memcached_get_hits_id" {
  value       = datadog_monitor.memcached_get_hits[*].id
  description = "id for monitor memcached_get_hits"
}

output "memcached_cpu_high_id" {
  value       = datadog_monitor.memcached_cpu_high[*].id
  description = "id for monitor memcached_cpu_high"
}

View File

@ -6,8 +6,8 @@
module "datadog-monitors-cloud-aws-elasticache-redis" { module "datadog-monitors-cloud-aws-elasticache-redis" {
source = "git::ssh://git@git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors.git//cloud/aws/elasticache/redis?ref={revision}" source = "git::ssh://git@git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors.git//cloud/aws/elasticache/redis?ref={revision}"
environment = "${var.environment}" environment = var.environment
message = "${module.datadog-message-alerting.alerting-message}" message = module.datadog-message-alerting.alerting-message
} }
``` ```
@ -26,18 +26,18 @@ Creates DataDog monitors with the following checks:
| Name | Description | Type | Default | Required | | Name | Description | Type | Default | Required |
|------|-------------|:----:|:-----:|:-----:| |------|-------------|:----:|:-----:|:-----:|
| cache\_hits\_enabled | Flag to enable Elasticache redis cache hits monitor | string | `"true"` | no | | cache\_hits\_enabled | Flag to enable Elasticache redis cache hits monitor | string | `"true"` | no |
| cache\_hits\_extra\_tags | Extra tags for Elasticache redis cache hits monitor | list | `[]` | no | | cache\_hits\_extra\_tags | Extra tags for Elasticache redis cache hits monitor | list(string) | `[]` | no |
| cache\_hits\_message | Custom message for Elasticache redis cache hits monitor | string | `""` | no | | cache\_hits\_message | Custom message for Elasticache redis cache hits monitor | string | `""` | no |
| cache\_hits\_threshold\_critical | Elasticache redis cache hits critical threshold in percentage | string | `"60"` | no | | cache\_hits\_threshold\_critical | Elasticache redis cache hits critical threshold in percentage | string | `"60"` | no |
| cache\_hits\_threshold\_warning | Elasticache redis cache hits warning threshold in percentage | string | `"80"` | no | | cache\_hits\_threshold\_warning | Elasticache redis cache hits warning threshold in percentage | string | `"80"` | no |
| cache\_hits\_time\_aggregator | Monitor aggregator for Elasticache redis cache hits [available values: min, max or avg] | string | `"max"` | no | | cache\_hits\_time\_aggregator | Monitor aggregator for Elasticache redis cache hits [available values: min, max or avg] | string | `"max"` | no |
| cache\_hits\_timeframe | Monitor timeframe for Elasticache redis cache hits [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_15m"` | no | | cache\_hits\_timeframe | Monitor timeframe for Elasticache redis cache hits [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_15m"` | no |
| commands\_enabled | Flag to enable Elasticache redis commands monitor | string | `"true"` | no | | commands\_enabled | Flag to enable Elasticache redis commands monitor | string | `"true"` | no |
| commands\_extra\_tags | Extra tags for Elasticache redis commands monitor | list | `[]` | no | | commands\_extra\_tags | Extra tags for Elasticache redis commands monitor | list(string) | `[]` | no |
| commands\_message | Custom message for Elasticache redis commands monitor | string | `""` | no | | commands\_message | Custom message for Elasticache redis commands monitor | string | `""` | no |
| commands\_timeframe | Monitor timeframe for Elasticache redis commands [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no | | commands\_timeframe | Monitor timeframe for Elasticache redis commands [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no |
| cpu\_high\_enabled | Flag to enable Elasticache redis cpu high monitor | string | `"true"` | no | | cpu\_high\_enabled | Flag to enable Elasticache redis cpu high monitor | string | `"true"` | no |
| cpu\_high\_extra\_tags | Extra tags for Elasticache redis cpu high monitor | list | `[]` | no | | cpu\_high\_extra\_tags | Extra tags for Elasticache redis cpu high monitor | list(string) | `[]` | no |
| cpu\_high\_message | Custom message for Elasticache redis cpu high monitor | string | `""` | no | | cpu\_high\_message | Custom message for Elasticache redis cpu high monitor | string | `""` | no |
| cpu\_high\_threshold\_critical | Elasticache redis cpu high critical threshold in percentage | string | `"90"` | no | | cpu\_high\_threshold\_critical | Elasticache redis cpu high critical threshold in percentage | string | `"90"` | no |
| cpu\_high\_threshold\_warning | Elasticache redis cpu high warning threshold in percentage | string | `"75"` | no | | cpu\_high\_threshold\_warning | Elasticache redis cpu high warning threshold in percentage | string | `"75"` | no |
@ -52,7 +52,7 @@ Creates DataDog monitors with the following checks:
| new\_host\_delay | Delay in seconds before monitor new resource | string | `"300"` | no | | new\_host\_delay | Delay in seconds before monitor new resource | string | `"300"` | no |
| prefix\_slug | Prefix string to prepend between brackets on every monitors names | string | `""` | no | | prefix\_slug | Prefix string to prepend between brackets on every monitors names | string | `""` | no |
| replication\_lag\_enabled | Flag to enable Elasticache redis replication lag monitor | string | `"true"` | no | | replication\_lag\_enabled | Flag to enable Elasticache redis replication lag monitor | string | `"true"` | no |
| replication\_lag\_extra\_tags | Extra tags for Elasticache redis replication lag monitor | list | `[]` | no | | replication\_lag\_extra\_tags | Extra tags for Elasticache redis replication lag monitor | list(string) | `[]` | no |
| replication\_lag\_message | Custom message for Elasticache redis replication lag monitor | string | `""` | no | | replication\_lag\_message | Custom message for Elasticache redis replication lag monitor | string | `""` | no |
| replication\_lag\_threshold\_critical | Elasticache redis replication lag critical threshold in seconds | string | `"180"` | no | | replication\_lag\_threshold\_critical | Elasticache redis replication lag critical threshold in seconds | string | `"180"` | no |
| replication\_lag\_threshold\_warning | Elasticache redis replication lag warning threshold in seconds | string | `"90"` | no | | replication\_lag\_threshold\_warning | Elasticache redis replication lag warning threshold in seconds | string | `"90"` | no |

View File

@ -1,7 +1,7 @@
# Global Terraform # Global Terraform
# Name of the target environment. No default is declared, so callers of this
# module must always supply a value.
variable "environment" {
  type        = string
  description = "Infrastructure Environment"
}
# Global DataDog # Global DataDog
@ -43,150 +43,151 @@ variable "filter_tags_custom_excluded" {
# ---------------------------------------------------------------------------
# Elasticache redis "cache hits" (cache hit ratio) monitor settings
# ---------------------------------------------------------------------------

# String toggle: the monitor resource is only created when this equals "true".
variable "cache_hits_enabled" {
  type        = string
  default     = "true"
  description = "Flag to enable Elasticache redis cache hits monitor"
}

# Appended to the fixed tag list of the monitor.
variable "cache_hits_extra_tags" {
  type        = list(string)
  default     = []
  description = "Extra tags for Elasticache redis cache hits monitor"
}

# Left empty, the monitor falls back to the module-wide `message` variable.
variable "cache_hits_message" {
  type        = string
  default     = ""
  description = "Custom message for Elasticache redis cache hits monitor"
}

variable "cache_hits_time_aggregator" {
  type        = string
  default     = "max"
  description = "Monitor aggregator for Elasticache redis cache hits [available values: min, max or avg]"
}

variable "cache_hits_timeframe" {
  type        = string
  default     = "last_15m"
  description = "Monitor timeframe for Elasticache redis cache hits [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
}

# The monitor query fires when the ratio drops BELOW the critical value, so the
# warning level is the higher of the two numbers. The numeric defaults are
# converted by Terraform to the declared string type.
variable "cache_hits_threshold_warning" {
  type        = string
  default     = 80
  description = "Elasticache redis cache hits warning threshold in percentage"
}

variable "cache_hits_threshold_critical" {
  type        = string
  default     = 60
  description = "Elasticache redis cache hits critical threshold in percentage"
}
# ---------------------------------------------------------------------------
# Elasticache redis "CPU high" monitor settings
# ---------------------------------------------------------------------------

# String toggle: the monitor resource is only created when this equals "true".
variable "cpu_high_enabled" {
  type        = string
  default     = "true"
  description = "Flag to enable Elasticache redis cpu high monitor"
}

# Appended to the fixed tag list of the monitor.
variable "cpu_high_extra_tags" {
  type        = list(string)
  default     = []
  description = "Extra tags for Elasticache redis cpu high monitor"
}

# Left empty, the monitor falls back to the module-wide `message` variable.
variable "cpu_high_message" {
  type        = string
  default     = ""
  description = "Custom message for Elasticache redis cpu high monitor"
}

variable "cpu_high_time_aggregator" {
  type        = string
  default     = "min"
  description = "Monitor aggregator for Elasticache redis cpu high [available values: min, max or avg]"
}

variable "cpu_high_timeframe" {
  type        = string
  default     = "last_15m"
  description = "Monitor timeframe for Elasticache redis cpu high [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
}

# The monitor query fires when utilisation rises ABOVE the critical value, so
# the warning level is the lower of the two numbers. The numeric defaults are
# converted by Terraform to the declared string type.
variable "cpu_high_threshold_warning" {
  type        = string
  default     = 75
  description = "Elasticache redis cpu high warning threshold in percentage"
}

variable "cpu_high_threshold_critical" {
  type        = string
  default     = 90
  description = "Elasticache redis cpu high critical threshold in percentage"
}
# ---------------------------------------------------------------------------
# Elasticache redis "replication lag" monitor settings
# ---------------------------------------------------------------------------

# String toggle: the monitor resource is only created when this equals "true".
variable "replication_lag_enabled" {
  type        = string
  default     = "true"
  description = "Flag to enable Elasticache redis replication lag monitor"
}

# Appended to the fixed tag list of the monitor.
variable "replication_lag_extra_tags" {
  type        = list(string)
  default     = []
  description = "Extra tags for Elasticache redis replication lag monitor"
}

# Left empty, the monitor falls back to the module-wide `message` variable.
variable "replication_lag_message" {
  type        = string
  default     = ""
  description = "Custom message for Elasticache redis replication lag monitor"
}

variable "replication_lag_time_aggregator" {
  type        = string
  default     = "min"
  description = "Monitor aggregator for Elasticache redis replication lag [available values: min, max or avg]"
}

variable "replication_lag_timeframe" {
  type        = string
  default     = "last_10m"
  description = "Monitor timeframe for Elasticache redis replication lag [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
}

# The monitor query fires when the lag rises ABOVE the critical value, so the
# warning level is the lower of the two numbers. The numeric defaults are
# converted by Terraform to the declared string type.
variable "replication_lag_threshold_warning" {
  type        = string
  default     = 90
  description = "Elasticache redis replication lag warning threshold in seconds"
}

variable "replication_lag_threshold_critical" {
  type        = string
  default     = 180
  description = "Elasticache redis replication lag critical threshold in seconds"
}
# ---------------------------------------------------------------------------
# Elasticache redis "commands" (no commands received) monitor settings
# ---------------------------------------------------------------------------

# String toggle: the monitor resource is only created when this equals "true".
variable "commands_enabled" {
  type        = string
  default     = "true"
  description = "Flag to enable Elasticache redis commands monitor"
}

variable "commands_extra_tags" {
  type        = list(string)
  default     = []
  description = "Extra tags for Elasticache redis commands monitor"
}

variable "commands_message" {
  type        = string
  default     = ""
  description = "Custom message for Elasticache redis commands monitor"
}

variable "commands_timeframe" {
  type        = string
  default     = "last_5m"
  description = "Monitor timeframe for Elasticache redis commands [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
}

View File

@ -1,9 +1,10 @@
# Shared helper module; the monitors in this directory read its `query_alert`
# output to scope their queries.
module "filter-tags" {
  source = "../../../../common/filter-tags"

  # Fixed for this module: every monitor here targets Elasticache.
  resource = "aws_elasticache"

  # Caller-provided settings, forwarded unchanged.
  environment                 = var.environment
  filter_tags_custom          = var.filter_tags_custom
  filter_tags_custom_excluded = var.filter_tags_custom_excluded
  filter_tags_use_defaults    = var.filter_tags_use_defaults
}

View File

@ -1,9 +1,8 @@
resource "datadog_monitor" "redis_cache_hits" { resource "datadog_monitor" "redis_cache_hits" {
count = "${var.cache_hits_enabled == "true" ? 1 : 0}" count = var.cache_hits_enabled == "true" ? 1 : 0
name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticache redis cache hit ratio {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticache redis cache hit ratio {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
message = "${coalesce(var.cache_hits_message, var.message)}" message = coalesce(var.cache_hits_message, var.message)
type = "query alert"
type = "query alert"
query = <<EOQ query = <<EOQ
${var.cache_hits_time_aggregator}(${var.cache_hits_timeframe}): default( ${var.cache_hits_time_aggregator}(${var.cache_hits_timeframe}): default(
@ -11,106 +10,104 @@ resource "datadog_monitor" "redis_cache_hits" {
avg:aws.elasticache.cache_hits${module.filter-tags.query_alert} by {region,cacheclusterid,cachenodeid}.as_rate() + avg:aws.elasticache.cache_hits${module.filter-tags.query_alert} by {region,cacheclusterid,cachenodeid}.as_rate() +
avg:aws.elasticache.cache_misses${module.filter-tags.query_alert} by {region,cacheclusterid,cachenodeid}.as_rate()) avg:aws.elasticache.cache_misses${module.filter-tags.query_alert} by {region,cacheclusterid,cachenodeid}.as_rate())
* 100, 100) < ${var.cache_hits_threshold_critical} * 100, 100) < ${var.cache_hits_threshold_critical}
EOQ EOQ
thresholds { thresholds = {
warning = "${var.cache_hits_threshold_warning}" warning = var.cache_hits_threshold_warning
critical = "${var.cache_hits_threshold_critical}" critical = var.cache_hits_threshold_critical
} }
evaluation_delay = var.evaluation_delay
new_host_delay = var.new_host_delay
notify_no_data = false notify_no_data = false
evaluation_delay = "${var.evaluation_delay}"
renotify_interval = 0 renotify_interval = 0
notify_audit = false notify_audit = false
timeout_h = 0 timeout_h = 0
include_tags = true include_tags = true
locked = false locked = false
require_full_window = false require_full_window = false
new_host_delay = "${var.new_host_delay}"
tags = ["env:${var.environment}", "type:cloud", "provider:aws", "resource:elasticache-redis", "team:claranet", "created-by:terraform", "engine:redis", "${var.cache_hits_extra_tags}"] tags = concat(["env:${var.environment}", "type:cloud", "provider:aws", "resource:elasticache-redis", "team:claranet", "created-by:terraform", "engine:redis"], var.cache_hits_extra_tags)
} }
# Alerts when Elasticache redis engine CPU utilisation, aggregated per
# region / cache cluster / cache node, exceeds the configured thresholds.
resource "datadog_monitor" "redis_cpu_high" {
  # Created only when the string flag is exactly "true".
  count   = var.cpu_high_enabled == "true" ? 1 : 0
  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticache redis CPU {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
  # Monitor-specific message wins; an empty one falls back to the shared message.
  message = coalesce(var.cpu_high_message, var.message)
  type    = "query alert"

  query = <<EOQ
    ${var.cpu_high_time_aggregator}(${var.cpu_high_timeframe}): (
      avg:aws.elasticache.engine_cpuutilization${module.filter-tags.query_alert} by {region,cacheclusterid,cachenodeid}
    ) > ${var.cpu_high_threshold_critical}
EOQ

  # This map was absent, so `cpu_high_threshold_warning` was declared and
  # documented but never sent to Datadog, and the `{{#is_warning}}` part of the
  # monitor name could never render. Declared the same way as the other
  # threshold monitors; the critical value must match the number in the query.
  thresholds = {
    warning  = var.cpu_high_threshold_warning
    critical = var.cpu_high_threshold_critical
  }

  evaluation_delay = var.evaluation_delay
  new_host_delay   = var.new_host_delay

  notify_no_data      = true
  renotify_interval   = 0
  notify_audit        = false
  timeout_h           = 0
  include_tags        = true
  locked              = false
  require_full_window = false

  tags = concat(["env:${var.environment}", "type:cloud", "provider:aws", "resource:elasticache-redis", "team:claranet", "created-by:terraform", "engine:redis"], var.cpu_high_extra_tags)
}
# Alerts when Elasticache redis replication lag, aggregated per
# region / cache cluster / cache node, exceeds the configured thresholds.
resource "datadog_monitor" "redis_replication_lag" {
  # Created only when the string flag is exactly "true".
  count = var.replication_lag_enabled == "true" ? 1 : 0

  type = "query alert"
  name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticache redis replication lag {{#is_alert}}{{{comparator}}} {{threshold}}s ({{value}}s){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}s ({{value}}s){{/is_warning}}"

  # Monitor-specific message wins; an empty one falls back to the shared message.
  message = coalesce(var.replication_lag_message, var.message)

  query = <<EOQ
    ${var.replication_lag_time_aggregator}(${var.replication_lag_timeframe}): (
      avg:aws.elasticache.replication_lag${module.filter-tags.query_alert} by {region,cacheclusterid,cachenodeid}
    ) > ${var.replication_lag_threshold_critical}
EOQ

  # The critical value must match the number embedded in the query above.
  thresholds = {
    critical = var.replication_lag_threshold_critical
    warning  = var.replication_lag_threshold_warning
  }

  # Timing options shared by all monitors of the module.
  new_host_delay   = var.new_host_delay
  evaluation_delay = var.evaluation_delay

  # Static monitor options.
  require_full_window = false
  locked              = false
  include_tags        = true
  timeout_h           = 0
  notify_audit        = false
  renotify_interval   = 0
  notify_no_data      = false

  tags = concat(
    [
      "env:${var.environment}",
      "type:cloud",
      "provider:aws",
      "resource:elasticache-redis",
      "team:claranet",
      "created-by:terraform",
      "engine:redis",
    ],
    var.replication_lag_extra_tags
  )
}
resource "datadog_monitor" "redis_commands" { resource "datadog_monitor" "redis_commands" {
count = "${var.commands_enabled == "true" ? 1 : 0}" count = var.commands_enabled == "true" ? 1 : 0
name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticache redis is receiving no commands" name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticache redis is receiving no commands"
message = "${coalesce(var.commands_message, var.message)}" message = coalesce(var.commands_message, var.message)
type = "query alert"
type = "query alert" query = <<EOQ
query = <<EOQ
sum(${var.commands_timeframe}): ( sum(${var.commands_timeframe}): (
avg:aws.elasticache.get_type_cmds${module.filter-tags.query_alert} by {region,cacheclusterid,cachenodeid}.as_count() + avg:aws.elasticache.get_type_cmds${module.filter-tags.query_alert} by {region,cacheclusterid,cachenodeid}.as_count() +
avg:aws.elasticache.set_type_cmds${module.filter-tags.query_alert} by {region,cacheclusterid,cachenodeid}.as_count() avg:aws.elasticache.set_type_cmds${module.filter-tags.query_alert} by {region,cacheclusterid,cachenodeid}.as_count()
) <= 0 ) <= 0
EOQ EOQ
evaluation_delay = var.evaluation_delay
new_host_delay = var.new_host_delay
notify_no_data = false notify_no_data = false
evaluation_delay = "${var.evaluation_delay}"
renotify_interval = 0 renotify_interval = 0
notify_audit = false notify_audit = false
timeout_h = 0 timeout_h = 0
include_tags = true include_tags = true
locked = false locked = false
require_full_window = false require_full_window = false
new_host_delay = "${var.new_host_delay}"
tags = ["env:${var.environment}", "type:cloud", "provider:aws", "resource:elasticache-redis", "team:claranet", "created-by:terraform", "engine:redis", "${var.commands_extra_tags}"] tags = concat(["env:${var.environment}", "type:cloud", "provider:aws", "resource:elasticache-redis", "team:claranet", "created-by:terraform", "engine:redis"], var.commands_extra_tags)
} }

View File

@ -1,19 +1,20 @@
output "redis_cache_hits_id" { output "redis_cache_hits_id" {
description = "id for monitor redis_cache_hits" description = "id for monitor redis_cache_hits"
value = "${datadog_monitor.redis_cache_hits.*.id}" value = datadog_monitor.redis_cache_hits.*.id
} }
output "redis_cpu_high_id" { output "redis_cpu_high_id" {
description = "id for monitor redis_cpu_high" description = "id for monitor redis_cpu_high"
value = "${datadog_monitor.redis_cpu_high.*.id}" value = datadog_monitor.redis_cpu_high.*.id
} }
output "redis_replication_lag_id" { output "redis_replication_lag_id" {
description = "id for monitor redis_replication_lag" description = "id for monitor redis_replication_lag"
value = "${datadog_monitor.redis_replication_lag.*.id}" value = datadog_monitor.redis_replication_lag.*.id
} }
output "redis_commands_id" { output "redis_commands_id" {
description = "id for monitor redis_commands" description = "id for monitor redis_commands"
value = "${datadog_monitor.redis_commands.*.id}" value = datadog_monitor.redis_commands.*.id
} }

View File

@ -6,8 +6,8 @@
module "datadog-monitors-cloud-aws-elasticsearch" { module "datadog-monitors-cloud-aws-elasticsearch" {
source = "git::ssh://git@git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors.git//cloud/aws/elasticsearch?ref={revision}" source = "git::ssh://git@git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors.git//cloud/aws/elasticsearch?ref={revision}"
environment = "${var.environment}" environment = var.environment
message = "${module.datadog-message-alerting.alerting-message}" message = module.datadog-message-alerting.alerting-message
es_cluster_volume_size = 42 es_cluster_volume_size = 42
} }
@ -27,14 +27,14 @@ Creates DataDog monitors with the following checks:
| Name | Description | Type | Default | Required | | Name | Description | Type | Default | Required |
|------|-------------|:----:|:-----:|:-----:| |------|-------------|:----:|:-----:|:-----:|
| cpu\_enabled | Flag to enable ES cluster cpu monitor | string | `"true"` | no | | cpu\_enabled | Flag to enable ES cluster cpu monitor | string | `"true"` | no |
| cpu\_extra\_tags | Extra tags for ES cluster cpu monitor | list | `[]` | no | | cpu\_extra\_tags | Extra tags for ES cluster cpu monitor | list(string) | `[]` | no |
| cpu\_message | Custom message for ES cluster cpu monitor | string | `""` | no | | cpu\_message | Custom message for ES cluster cpu monitor | string | `""` | no |
| cpu\_threshold\_critical | CPU usage in percent (critical threshold) | string | `"90"` | no | | cpu\_threshold\_critical | CPU usage in percent (critical threshold) | string | `"90"` | no |
| cpu\_threshold\_warning | CPU usage in percent (warning threshold) | string | `"80"` | no | | cpu\_threshold\_warning | CPU usage in percent (warning threshold) | string | `"80"` | no |
| cpu\_time\_aggregator | Monitor aggregator for ES cluster cpu [available values: min, max or avg] | string | `"min"` | no | | cpu\_time\_aggregator | Monitor aggregator for ES cluster cpu [available values: min, max or avg] | string | `"min"` | no |
| cpu\_timeframe | Monitor timeframe for ES cluster cpu [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_15m"` | no | | cpu\_timeframe | Monitor timeframe for ES cluster cpu [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_15m"` | no |
| diskspace\_enabled | Flag to enable ES cluster diskspace monitor | string | `"true"` | no | | diskspace\_enabled | Flag to enable ES cluster diskspace monitor | string | `"true"` | no |
| diskspace\_extra\_tags | Extra tags for ES cluster diskspace monitor | list | `[]` | no | | diskspace\_extra\_tags | Extra tags for ES cluster diskspace monitor | list(string) | `[]` | no |
| diskspace\_message | Custom message for ES cluster diskspace monitor | string | `""` | no | | diskspace\_message | Custom message for ES cluster diskspace monitor | string | `""` | no |
| diskspace\_threshold\_critical | Disk free space in percent (critical threshold) | string | `"10"` | no | | diskspace\_threshold\_critical | Disk free space in percent (critical threshold) | string | `"10"` | no |
| diskspace\_threshold\_warning | Disk free space in percent (warning threshold) | string | `"20"` | no | | diskspace\_threshold\_warning | Disk free space in percent (warning threshold) | string | `"20"` | no |
@ -42,7 +42,7 @@ Creates DataDog monitors with the following checks:
| diskspace\_timeframe | Monitor timeframe for ES cluster diskspace [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_15m"` | no | | diskspace\_timeframe | Monitor timeframe for ES cluster diskspace [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_15m"` | no |
| environment | Architecture Environment | string | n/a | yes | | environment | Architecture Environment | string | n/a | yes |
| es\_cluster\_status\_enabled | Flag to enable ES cluster status monitor | string | `"true"` | no | | es\_cluster\_status\_enabled | Flag to enable ES cluster status monitor | string | `"true"` | no |
| es\_cluster\_status\_extra\_tags | Extra tags for ES cluster status monitor | list | `[]` | no | | es\_cluster\_status\_extra\_tags | Extra tags for ES cluster status monitor | list(string) | `[]` | no |
| es\_cluster\_status\_message | Custom message for ES cluster status monitor | string | `""` | no | | es\_cluster\_status\_message | Custom message for ES cluster status monitor | string | `""` | no |
| es\_cluster\_status\_timeframe | Monitor timeframe for ES cluster status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_30m"` | no | | es\_cluster\_status\_timeframe | Monitor timeframe for ES cluster status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_30m"` | no |
| es\_cluster\_volume\_size | ElasticSearch Domain volume size (in GB) | string | n/a | yes | | es\_cluster\_volume\_size | ElasticSearch Domain volume size (in GB) | string | n/a | yes |

View File

@ -1,7 +1,7 @@
# Global Terraform # Global Terraform
variable "environment" { variable "environment" {
description = "Architecture Environment" description = "Architecture Environment"
type = "string" type = string
} }
# Global DataDog # Global DataDog
@ -43,25 +43,25 @@ variable "filter_tags_custom_excluded" {
variable "es_cluster_status_enabled" { variable "es_cluster_status_enabled" {
description = "Flag to enable ES cluster status monitor" description = "Flag to enable ES cluster status monitor"
type = "string" type = string
default = "true" default = "true"
} }
variable "es_cluster_status_extra_tags" { variable "es_cluster_status_extra_tags" {
description = "Extra tags for ES cluster status monitor" description = "Extra tags for ES cluster status monitor"
type = "list" type = list(string)
default = [] default = []
} }
variable "es_cluster_status_message" { variable "es_cluster_status_message" {
description = "Custom message for ES cluster status monitor" description = "Custom message for ES cluster status monitor"
type = "string" type = string
default = "" default = ""
} }
variable "es_cluster_status_timeframe" { variable "es_cluster_status_timeframe" {
description = "Monitor timeframe for ES cluster status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" description = "Monitor timeframe for ES cluster status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string" type = string
default = "last_30m" default = "last_30m"
} }
@ -71,31 +71,31 @@ variable "es_cluster_volume_size" {
variable "diskspace_enabled" { variable "diskspace_enabled" {
description = "Flag to enable ES cluster diskspace monitor" description = "Flag to enable ES cluster diskspace monitor"
type = "string" type = string
default = "true" default = "true"
} }
variable "diskspace_extra_tags" { variable "diskspace_extra_tags" {
description = "Extra tags for ES cluster diskspace monitor" description = "Extra tags for ES cluster diskspace monitor"
type = "list" type = list(string)
default = [] default = []
} }
variable "diskspace_message" { variable "diskspace_message" {
description = "Custom message for ES cluster diskspace monitor" description = "Custom message for ES cluster diskspace monitor"
type = "string" type = string
default = "" default = ""
} }
variable "diskspace_time_aggregator" { variable "diskspace_time_aggregator" {
description = "Monitor aggregator for ES cluster diskspace [available values: min, max or avg]" description = "Monitor aggregator for ES cluster diskspace [available values: min, max or avg]"
type = "string" type = string
default = "max" default = "max"
} }
variable "diskspace_timeframe" { variable "diskspace_timeframe" {
description = "Monitor timeframe for ES cluster diskspace [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" description = "Monitor timeframe for ES cluster diskspace [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string" type = string
default = "last_15m" default = "last_15m"
} }
@ -111,31 +111,31 @@ variable "diskspace_threshold_critical" {
variable "cpu_enabled" { variable "cpu_enabled" {
description = "Flag to enable ES cluster cpu monitor" description = "Flag to enable ES cluster cpu monitor"
type = "string" type = string
default = "true" default = "true"
} }
variable "cpu_extra_tags" { variable "cpu_extra_tags" {
description = "Extra tags for ES cluster cpu monitor" description = "Extra tags for ES cluster cpu monitor"
type = "list" type = list(string)
default = [] default = []
} }
variable "cpu_message" { variable "cpu_message" {
description = "Custom message for ES cluster cpu monitor" description = "Custom message for ES cluster cpu monitor"
type = "string" type = string
default = "" default = ""
} }
variable "cpu_time_aggregator" { variable "cpu_time_aggregator" {
description = "Monitor aggregator for ES cluster cpu [available values: min, max or avg]" description = "Monitor aggregator for ES cluster cpu [available values: min, max or avg]"
type = "string" type = string
default = "min" default = "min"
} }
variable "cpu_timeframe" { variable "cpu_timeframe" {
description = "Monitor timeframe for ES cluster cpu [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" description = "Monitor timeframe for ES cluster cpu [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string" type = string
default = "last_15m" default = "last_15m"
} }
@ -148,3 +148,4 @@ variable "cpu_threshold_critical" {
description = "CPU usage in percent (critical threshold)" description = "CPU usage in percent (critical threshold)"
default = "90" default = "90"
} }

View File

@ -1,9 +1,10 @@
module "filter-tags" { module "filter-tags" {
source = "../../../common/filter-tags" source = "../../../common/filter-tags"
environment = "${var.environment}" environment = var.environment
resource = "aws_elasticsearch" resource = "aws_elasticsearch"
filter_tags_use_defaults = "${var.filter_tags_use_defaults}" filter_tags_use_defaults = var.filter_tags_use_defaults
filter_tags_custom = "${var.filter_tags_custom}" filter_tags_custom = var.filter_tags_custom
filter_tags_custom_excluded = "${var.filter_tags_custom_excluded}" filter_tags_custom_excluded = var.filter_tags_custom_excluded
} }

View File

@ -4,98 +4,96 @@
- If aws.es.cluster_statusyellow is 1 --> 1 < query value (=1.1) < 2 : warning - If aws.es.cluster_statusyellow is 1 --> 1 < query value (=1.1) < 2 : warning
Workaround : in the query, we add "0.1" to the result and we use the comparator ">=". No alert was triggered without that. */ Workaround : in the query, we add "0.1" to the result and we use the comparator ">=". No alert was triggered without that. */
resource "datadog_monitor" "es_cluster_status" { resource "datadog_monitor" "es_cluster_status" {
count = "${var.es_cluster_status_enabled == "true" ? 1 : 0}" count = var.es_cluster_status_enabled == "true" ? 1 : 0
name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] ElasticSearch cluster status is not green" name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] ElasticSearch cluster status is not green"
message = "${coalesce(var.es_cluster_status_message, var.message)}" message = coalesce(var.es_cluster_status_message, var.message)
type = "query alert"
type = "query alert"
query = <<EOQ query = <<EOQ
max(${var.es_cluster_status_timeframe}): ( max(${var.es_cluster_status_timeframe}): (
avg:aws.es.cluster_statusred${module.filter-tags.query_alert} by {region,name} * 2 + avg:aws.es.cluster_statusred${module.filter-tags.query_alert} by {region,name} * 2 +
(avg:aws.es.cluster_statusyellow${module.filter-tags.query_alert} by {region,name} + 0.1) (avg:aws.es.cluster_statusyellow${module.filter-tags.query_alert} by {region,name} + 0.1)
) >= 2 ) >= 2
EOQ EOQ
thresholds { thresholds = {
warning = 1 warning = 1
critical = 2 critical = 2
} }
evaluation_delay = var.evaluation_delay
new_host_delay = var.new_host_delay
notify_no_data = true notify_no_data = true
evaluation_delay = "${var.evaluation_delay}"
renotify_interval = 0 renotify_interval = 0
notify_audit = false notify_audit = false
timeout_h = 0 timeout_h = 0
include_tags = true include_tags = true
locked = false locked = false
require_full_window = false require_full_window = false
new_host_delay = "${var.new_host_delay}"
tags = ["env:${var.environment}", "type:cloud", "provider:aws", "resource:elasticsearch", "team:claranet", "created-by:terraform", "${var.es_cluster_status_extra_tags}"] tags = concat(["env:${var.environment}", "type:cloud", "provider:aws", "resource:elasticsearch", "team:claranet", "created-by:terraform"], var.es_cluster_status_extra_tags)
} }
### Elasticsearch cluster free storage space monitor ### ### Elasticsearch cluster free storage space monitor ###
resource "datadog_monitor" "es_free_space_low" { resource "datadog_monitor" "es_free_space_low" {
count = "${var.diskspace_enabled == "true" ? 1 : 0}" count = var.diskspace_enabled == "true" ? 1 : 0
name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] ElasticSearch cluster free storage space {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] ElasticSearch cluster free storage space {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
message = "${coalesce(var.diskspace_message, var.message)}" message = coalesce(var.diskspace_message, var.message)
type = "query alert"
type = "query alert"
query = <<EOQ query = <<EOQ
${var.diskspace_time_aggregator}(${var.diskspace_timeframe}): ( ${var.diskspace_time_aggregator}(${var.diskspace_timeframe}): (
avg:aws.es.free_storage_space${module.filter-tags.query_alert} by {region,name} / avg:aws.es.free_storage_space${module.filter-tags.query_alert} by {region,name} /
(${var.es_cluster_volume_size}*1000) * 100 (${var.es_cluster_volume_size}*1000) * 100
) < ${var.diskspace_threshold_critical} ) < ${var.diskspace_threshold_critical}
EOQ EOQ
thresholds { thresholds = {
warning = "${var.diskspace_threshold_warning}" warning = var.diskspace_threshold_warning
critical = "${var.diskspace_threshold_critical}" critical = var.diskspace_threshold_critical
} }
evaluation_delay = var.evaluation_delay
new_host_delay = var.new_host_delay
notify_no_data = false notify_no_data = false
evaluation_delay = "${var.evaluation_delay}"
renotify_interval = 0 renotify_interval = 0
notify_audit = false notify_audit = false
timeout_h = 0 timeout_h = 0
include_tags = true include_tags = true
locked = false locked = false
require_full_window = false require_full_window = false
new_host_delay = "${var.new_host_delay}"
tags = ["env:${var.environment}", "type:cloud", "provider:aws", "resource:elasticsearch", "team:claranet", "created-by:terraform", "${var.diskspace_extra_tags}"] tags = concat(["env:${var.environment}", "type:cloud", "provider:aws", "resource:elasticsearch", "team:claranet", "created-by:terraform"], var.diskspace_extra_tags)
} }
### Elasticsearch cluster CPU monitor ### ### Elasticsearch cluster CPU monitor ###
resource "datadog_monitor" "es_cpu_90_15min" { resource "datadog_monitor" "es_cpu_90_15min" {
count = "${var.cpu_enabled == "true" ? 1 : 0}" count = var.cpu_enabled == "true" ? 1 : 0
name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] ElasticSearch cluster CPU high {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] ElasticSearch cluster CPU high {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
message = "${coalesce(var.cpu_message, var.message)}" message = coalesce(var.cpu_message, var.message)
type = "query alert"
type = "query alert" query = <<EOQ
query = <<EOQ
${var.cpu_time_aggregator}(${var.cpu_timeframe}): ( ${var.cpu_time_aggregator}(${var.cpu_timeframe}): (
avg:aws.es.cpuutilization${module.filter-tags.query_alert} by {region,name} avg:aws.es.cpuutilization${module.filter-tags.query_alert} by {region,name}
) > ${var.cpu_threshold_critical} ) > ${var.cpu_threshold_critical}
EOQ EOQ
thresholds { thresholds = {
warning = "${var.cpu_threshold_warning}" warning = var.cpu_threshold_warning
critical = "${var.cpu_threshold_critical}" critical = var.cpu_threshold_critical
} }
evaluation_delay = var.evaluation_delay
new_host_delay = var.new_host_delay
notify_no_data = false notify_no_data = false
evaluation_delay = "${var.evaluation_delay}"
renotify_interval = 0 renotify_interval = 0
notify_audit = false notify_audit = false
timeout_h = 0 timeout_h = 0
include_tags = true include_tags = true
locked = false locked = false
require_full_window = false require_full_window = false
new_host_delay = "${var.new_host_delay}"
tags = ["env:${var.environment}", "type:cloud", "provider:aws", "resource:elasticsearch", "team:claranet", "created-by:terraform", "${var.cpu_extra_tags}"] tags = concat(["env:${var.environment}", "type:cloud", "provider:aws", "resource:elasticsearch", "team:claranet", "created-by:terraform"], var.cpu_extra_tags)
} }

View File

@ -1,14 +1,15 @@
output "es_cluster_status_id" { output "es_cluster_status_id" {
description = "id for monitor es_cluster_status" description = "id for monitor es_cluster_status"
value = "${datadog_monitor.es_cluster_status.*.id}" value = datadog_monitor.es_cluster_status.*.id
} }
output "es_free_space_low_id" { output "es_free_space_low_id" {
description = "id for monitor es_free_space_low" description = "id for monitor es_free_space_low"
value = "${datadog_monitor.es_free_space_low.*.id}" value = datadog_monitor.es_free_space_low.*.id
} }
output "es_cpu_90_15min_id" { output "es_cpu_90_15min_id" {
description = "id for monitor es_cpu_90_15min" description = "id for monitor es_cpu_90_15min"
value = "${datadog_monitor.es_cpu_90_15min.*.id}" value = datadog_monitor.es_cpu_90_15min.*.id
} }

View File

@ -6,8 +6,8 @@
module "datadog-monitors-cloud-aws-elb" { module "datadog-monitors-cloud-aws-elb" {
source = "git::ssh://git@git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors.git//cloud/aws/elb?ref={revision}" source = "git::ssh://git@git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors.git//cloud/aws/elb?ref={revision}"
environment = "${var.environment}" environment = var.environment
message = "${module.datadog-message-alerting.alerting-message}" message = module.datadog-message-alerting.alerting-message
} }
``` ```
@ -29,38 +29,38 @@ Creates DataDog monitors with the following checks:
|------|-------------|:----:|:-----:|:-----:| |------|-------------|:----:|:-----:|:-----:|
| artificial\_requests\_count | Number of false requests used to mitigate false positive in case of low trafic | string | `"5"` | no | | artificial\_requests\_count | Number of false requests used to mitigate false positive in case of low trafic | string | `"5"` | no |
| elb\_4xx\_enabled | Flag to enable ELB 4xx errors monitor | string | `"true"` | no | | elb\_4xx\_enabled | Flag to enable ELB 4xx errors monitor | string | `"true"` | no |
| elb\_4xx\_extra\_tags | Extra tags for ELB 4xx errors monitor | list | `[]` | no | | elb\_4xx\_extra\_tags | Extra tags for ELB 4xx errors monitor | list(string) | `[]` | no |
| elb\_4xx\_message | Custom message for ELB 4xx errors monitor | string | `""` | no | | elb\_4xx\_message | Custom message for ELB 4xx errors monitor | string | `""` | no |
| elb\_4xx\_threshold\_critical | loadbalancer 4xx critical threshold in percentage | string | `"10"` | no | | elb\_4xx\_threshold\_critical | loadbalancer 4xx critical threshold in percentage | string | `"10"` | no |
| elb\_4xx\_threshold\_warning | loadbalancer 4xx warning threshold in percentage | string | `"5"` | no | | elb\_4xx\_threshold\_warning | loadbalancer 4xx warning threshold in percentage | string | `"5"` | no |
| elb\_4xx\_timeframe | Monitor timeframe for ELB 4xx errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no | | elb\_4xx\_timeframe | Monitor timeframe for ELB 4xx errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no |
| elb\_5xx\_enabled | Flag to enable ELB 5xx errors monitor | string | `"true"` | no | | elb\_5xx\_enabled | Flag to enable ELB 5xx errors monitor | string | `"true"` | no |
| elb\_5xx\_extra\_tags | Extra tags for ELB 5xx errors monitor | list | `[]` | no | | elb\_5xx\_extra\_tags | Extra tags for ELB 5xx errors monitor | list(string) | `[]` | no |
| elb\_5xx\_message | Custom message for ELB 5xx errors monitor | string | `""` | no | | elb\_5xx\_message | Custom message for ELB 5xx errors monitor | string | `""` | no |
| elb\_5xx\_threshold\_critical | loadbalancer 5xx critical threshold in percentage | string | `"10"` | no | | elb\_5xx\_threshold\_critical | loadbalancer 5xx critical threshold in percentage | string | `"10"` | no |
| elb\_5xx\_threshold\_warning | loadbalancer 5xx warning threshold in percentage | string | `"5"` | no | | elb\_5xx\_threshold\_warning | loadbalancer 5xx warning threshold in percentage | string | `"5"` | no |
| elb\_5xx\_timeframe | Monitor timeframe for ELB 5xx errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no | | elb\_5xx\_timeframe | Monitor timeframe for ELB 5xx errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no |
| elb\_backend\_4xx\_enabled | Flag to enable ELB backend 4xx errors monitor | string | `"true"` | no | | elb\_backend\_4xx\_enabled | Flag to enable ELB backend 4xx errors monitor | string | `"true"` | no |
| elb\_backend\_4xx\_extra\_tags | Extra tags for ELB backend 4xx errors monitor | list | `[]` | no | | elb\_backend\_4xx\_extra\_tags | Extra tags for ELB backend 4xx errors monitor | list(string) | `[]` | no |
| elb\_backend\_4xx\_message | Custom message for ELB backend 4xx errors monitor | string | `""` | no | | elb\_backend\_4xx\_message | Custom message for ELB backend 4xx errors monitor | string | `""` | no |
| elb\_backend\_4xx\_threshold\_critical | loadbalancer backend 4xx critical threshold in percentage | string | `"10"` | no | | elb\_backend\_4xx\_threshold\_critical | loadbalancer backend 4xx critical threshold in percentage | string | `"10"` | no |
| elb\_backend\_4xx\_threshold\_warning | loadbalancer backend 4xx warning threshold in percentage | string | `"5"` | no | | elb\_backend\_4xx\_threshold\_warning | loadbalancer backend 4xx warning threshold in percentage | string | `"5"` | no |
| elb\_backend\_4xx\_timeframe | Monitor timeframe for ELB backend 4xx errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no | | elb\_backend\_4xx\_timeframe | Monitor timeframe for ELB backend 4xx errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no |
| elb\_backend\_5xx\_enabled | Flag to enable ELB backend 5xx errors monitor | string | `"true"` | no | | elb\_backend\_5xx\_enabled | Flag to enable ELB backend 5xx errors monitor | string | `"true"` | no |
| elb\_backend\_5xx\_extra\_tags | Extra tags for ELB backend 5xx errors monitor | list | `[]` | no | | elb\_backend\_5xx\_extra\_tags | Extra tags for ELB backend 5xx errors monitor | list(string) | `[]` | no |
| elb\_backend\_5xx\_message | Custom message for ELB backend 5xx errors monitor | string | `""` | no | | elb\_backend\_5xx\_message | Custom message for ELB backend 5xx errors monitor | string | `""` | no |
| elb\_backend\_5xx\_threshold\_critical | loadbalancer backend 5xx critical threshold in percentage | string | `"10"` | no | | elb\_backend\_5xx\_threshold\_critical | loadbalancer backend 5xx critical threshold in percentage | string | `"10"` | no |
| elb\_backend\_5xx\_threshold\_warning | loadbalancer backend 5xx warning threshold in percentage | string | `"5"` | no | | elb\_backend\_5xx\_threshold\_warning | loadbalancer backend 5xx warning threshold in percentage | string | `"5"` | no |
| elb\_backend\_5xx\_timeframe | Monitor timeframe for ELB backend 5xx errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no | | elb\_backend\_5xx\_timeframe | Monitor timeframe for ELB backend 5xx errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no |
| elb\_backend\_latency\_critical | latency critical threshold in seconds | string | `"5"` | no | | elb\_backend\_latency\_critical | latency critical threshold in seconds | string | `"5"` | no |
| elb\_backend\_latency\_enabled | Flag to enable ELB backend latency monitor | string | `"true"` | no | | elb\_backend\_latency\_enabled | Flag to enable ELB backend latency monitor | string | `"true"` | no |
| elb\_backend\_latency\_extra\_tags | Extra tags for ELB backend latency monitor | list | `[]` | no | | elb\_backend\_latency\_extra\_tags | Extra tags for ELB backend latency monitor | list(string) | `[]` | no |
| elb\_backend\_latency\_message | Custom message for ELB backend latency monitor | string | `""` | no | | elb\_backend\_latency\_message | Custom message for ELB backend latency monitor | string | `""` | no |
| elb\_backend\_latency\_time\_aggregator | Monitor aggregator for ELB backend latency [available values: min, max or avg] | string | `"min"` | no | | elb\_backend\_latency\_time\_aggregator | Monitor aggregator for ELB backend latency [available values: min, max or avg] | string | `"min"` | no |
| elb\_backend\_latency\_timeframe | Monitor timeframe for ELB backend latency [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no | | elb\_backend\_latency\_timeframe | Monitor timeframe for ELB backend latency [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no |
| elb\_backend\_latency\_warning | latency warning threshold in seconds | string | `"1"` | no | | elb\_backend\_latency\_warning | latency warning threshold in seconds | string | `"1"` | no |
| elb\_no\_healthy\_instance\_enabled | Flag to enable ELB no healty instance monitor | string | `"true"` | no | | elb\_no\_healthy\_instance\_enabled | Flag to enable ELB no healty instance monitor | string | `"true"` | no |
| elb\_no\_healthy\_instance\_extra\_tags | Extra tags for ELB no healty instance monitor | list | `[]` | no | | elb\_no\_healthy\_instance\_extra\_tags | Extra tags for ELB no healty instance monitor | list(string) | `[]` | no |
| elb\_no\_healthy\_instance\_message | Custom message for ELB no healty instance monitor | string | `""` | no | | elb\_no\_healthy\_instance\_message | Custom message for ELB no healty instance monitor | string | `""` | no |
| elb\_no\_healthy\_instance\_time\_aggregator | Monitor aggregator for ELB no healty instance [available values: min or max] | string | `"min"` | no | | elb\_no\_healthy\_instance\_time\_aggregator | Monitor aggregator for ELB no healty instance [available values: min or max] | string | `"min"` | no |
| elb\_no\_healthy\_instance\_timeframe | Monitor timeframe for ELB no healty instance [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no | | elb\_no\_healthy\_instance\_timeframe | Monitor timeframe for ELB no healty instance [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no |

View File

@ -1,7 +1,7 @@
# Global Terraform # Global Terraform
variable "environment" { variable "environment" {
description = "Architecture Environment" description = "Architecture Environment"
type = "string" type = string
} }
# Global DataDog # Global DataDog
@ -43,55 +43,55 @@ variable "filter_tags_custom_excluded" {
variable "elb_no_healthy_instance_enabled" { variable "elb_no_healthy_instance_enabled" {
description = "Flag to enable ELB no healty instance monitor" description = "Flag to enable ELB no healty instance monitor"
type = "string" type = string
default = "true" default = "true"
} }
variable "elb_no_healthy_instance_extra_tags" { variable "elb_no_healthy_instance_extra_tags" {
description = "Extra tags for ELB no healty instance monitor" description = "Extra tags for ELB no healty instance monitor"
type = "list" type = list(string)
default = [] default = []
} }
variable "elb_no_healthy_instance_message" { variable "elb_no_healthy_instance_message" {
description = "Custom message for ELB no healty instance monitor" description = "Custom message for ELB no healty instance monitor"
type = "string" type = string
default = "" default = ""
} }
variable "elb_no_healthy_instance_time_aggregator" { variable "elb_no_healthy_instance_time_aggregator" {
description = "Monitor aggregator for ELB no healty instance [available values: min or max]" description = "Monitor aggregator for ELB no healty instance [available values: min or max]"
type = "string" type = string
default = "min" default = "min"
} }
variable "elb_no_healthy_instance_timeframe" { variable "elb_no_healthy_instance_timeframe" {
description = "Monitor timeframe for ELB no healty instance [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" description = "Monitor timeframe for ELB no healty instance [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string" type = string
default = "last_5m" default = "last_5m"
} }
variable "elb_4xx_enabled" { variable "elb_4xx_enabled" {
description = "Flag to enable ELB 4xx errors monitor" description = "Flag to enable ELB 4xx errors monitor"
type = "string" type = string
default = "true" default = "true"
} }
variable "elb_4xx_extra_tags" { variable "elb_4xx_extra_tags" {
description = "Extra tags for ELB 4xx errors monitor" description = "Extra tags for ELB 4xx errors monitor"
type = "list" type = list(string)
default = [] default = []
} }
variable "elb_4xx_message" { variable "elb_4xx_message" {
description = "Custom message for ELB 4xx errors monitor" description = "Custom message for ELB 4xx errors monitor"
type = "string" type = string
default = "" default = ""
} }
variable "elb_4xx_timeframe" { variable "elb_4xx_timeframe" {
description = "Monitor timeframe for ELB 4xx errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" description = "Monitor timeframe for ELB 4xx errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string" type = string
default = "last_5m" default = "last_5m"
} }
@ -107,25 +107,25 @@ variable "elb_4xx_threshold_critical" {
variable "elb_5xx_enabled" { variable "elb_5xx_enabled" {
description = "Flag to enable ELB 5xx errors monitor" description = "Flag to enable ELB 5xx errors monitor"
type = "string" type = string
default = "true" default = "true"
} }
variable "elb_5xx_extra_tags" { variable "elb_5xx_extra_tags" {
description = "Extra tags for ELB 5xx errors monitor" description = "Extra tags for ELB 5xx errors monitor"
type = "list" type = list(string)
default = [] default = []
} }
variable "elb_5xx_message" { variable "elb_5xx_message" {
description = "Custom message for ELB 5xx errors monitor" description = "Custom message for ELB 5xx errors monitor"
type = "string" type = string
default = "" default = ""
} }
variable "elb_5xx_timeframe" { variable "elb_5xx_timeframe" {
description = "Monitor timeframe for ELB 5xx errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" description = "Monitor timeframe for ELB 5xx errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string" type = string
default = "last_5m" default = "last_5m"
} }
@ -141,25 +141,25 @@ variable "elb_5xx_threshold_critical" {
variable "elb_backend_4xx_enabled" { variable "elb_backend_4xx_enabled" {
description = "Flag to enable ELB backend 4xx errors monitor" description = "Flag to enable ELB backend 4xx errors monitor"
type = "string" type = string
default = "true" default = "true"
} }
variable "elb_backend_4xx_extra_tags" { variable "elb_backend_4xx_extra_tags" {
description = "Extra tags for ELB backend 4xx errors monitor" description = "Extra tags for ELB backend 4xx errors monitor"
type = "list" type = list(string)
default = [] default = []
} }
variable "elb_backend_4xx_message" { variable "elb_backend_4xx_message" {
description = "Custom message for ELB backend 4xx errors monitor" description = "Custom message for ELB backend 4xx errors monitor"
type = "string" type = string
default = "" default = ""
} }
variable "elb_backend_4xx_timeframe" { variable "elb_backend_4xx_timeframe" {
description = "Monitor timeframe for ELB backend 4xx errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" description = "Monitor timeframe for ELB backend 4xx errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string" type = string
default = "last_5m" default = "last_5m"
} }
@ -175,25 +175,25 @@ variable "elb_backend_4xx_threshold_critical" {
variable "elb_backend_5xx_enabled" { variable "elb_backend_5xx_enabled" {
description = "Flag to enable ELB backend 5xx errors monitor" description = "Flag to enable ELB backend 5xx errors monitor"
type = "string" type = string
default = "true" default = "true"
} }
variable "elb_backend_5xx_extra_tags" { variable "elb_backend_5xx_extra_tags" {
description = "Extra tags for ELB backend 5xx errors monitor" description = "Extra tags for ELB backend 5xx errors monitor"
type = "list" type = list(string)
default = [] default = []
} }
variable "elb_backend_5xx_message" { variable "elb_backend_5xx_message" {
description = "Custom message for ELB backend 5xx errors monitor" description = "Custom message for ELB backend 5xx errors monitor"
type = "string" type = string
default = "" default = ""
} }
variable "elb_backend_5xx_timeframe" { variable "elb_backend_5xx_timeframe" {
description = "Monitor timeframe for ELB backend 5xx errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" description = "Monitor timeframe for ELB backend 5xx errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string" type = string
default = "last_5m" default = "last_5m"
} }
@ -209,31 +209,31 @@ variable "elb_backend_5xx_threshold_critical" {
variable "elb_backend_latency_enabled" { variable "elb_backend_latency_enabled" {
description = "Flag to enable ELB backend latency monitor" description = "Flag to enable ELB backend latency monitor"
type = "string" type = string
default = "true" default = "true"
} }
variable "elb_backend_latency_extra_tags" { variable "elb_backend_latency_extra_tags" {
description = "Extra tags for ELB backend latency monitor" description = "Extra tags for ELB backend latency monitor"
type = "list" type = list(string)
default = [] default = []
} }
variable "elb_backend_latency_message" { variable "elb_backend_latency_message" {
description = "Custom message for ELB backend latency monitor" description = "Custom message for ELB backend latency monitor"
type = "string" type = string
default = "" default = ""
} }
variable "elb_backend_latency_time_aggregator" { variable "elb_backend_latency_time_aggregator" {
description = "Monitor aggregator for ELB backend latency [available values: min, max or avg]" description = "Monitor aggregator for ELB backend latency [available values: min, max or avg]"
type = "string" type = string
default = "min" default = "min"
} }
variable "elb_backend_latency_timeframe" { variable "elb_backend_latency_timeframe" {
description = "Monitor timeframe for ELB backend latency [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" description = "Monitor timeframe for ELB backend latency [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string" type = string
default = "last_5m" default = "last_5m"
} }
@ -251,3 +251,4 @@ variable "artificial_requests_count" {
default = 5 default = 5
description = "Number of false requests used to mitigate false positive in case of low trafic" description = "Number of false requests used to mitigate false positive in case of low trafic"
} }

View File

@ -1,9 +1,10 @@
module "filter-tags" { module "filter-tags" {
source = "../../../common/filter-tags" source = "../../../common/filter-tags"
environment = "${var.environment}" environment = var.environment
resource = "aws_elb" resource = "aws_elb"
filter_tags_use_defaults = "${var.filter_tags_use_defaults}" filter_tags_use_defaults = var.filter_tags_use_defaults
filter_tags_custom = "${var.filter_tags_custom}" filter_tags_custom = var.filter_tags_custom
filter_tags_custom_excluded = "${var.filter_tags_custom_excluded}" filter_tags_custom_excluded = var.filter_tags_custom_excluded
} }

View File

@ -1,7 +1,8 @@
resource "datadog_monitor" "ELB_no_healthy_instances" { resource "datadog_monitor" "ELB_no_healthy_instances" {
count = "${var.elb_no_healthy_instance_enabled == "true" ? 1 : 0}" count = var.elb_no_healthy_instance_enabled == "true" ? 1 : 0
name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] ELB healthy instances {{#is_alert}}is at 0{{/is_alert}}{{#is_warning}}is at {{value}}%{{/is_warning}}" name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] ELB healthy instances {{#is_alert}}is at 0{{/is_alert}}{{#is_warning}}is at {{value}}%%{{/is_warning}}"
message = "${coalesce(var.elb_no_healthy_instance_message, var.message)}" message = coalesce(var.elb_no_healthy_instance_message, var.message)
type = "query alert"
query = <<EOQ query = <<EOQ
${var.elb_no_healthy_instance_time_aggregator}(${var.elb_no_healthy_instance_timeframe}): ( ${var.elb_no_healthy_instance_time_aggregator}(${var.elb_no_healthy_instance_timeframe}): (
@ -9,183 +10,177 @@ resource "datadog_monitor" "ELB_no_healthy_instances" {
sum:aws.elb.healthy_host_count.maximum${module.filter-tags.query_alert} by {region,loadbalancername} + sum:aws.elb.healthy_host_count.maximum${module.filter-tags.query_alert} by {region,loadbalancername} +
sum:aws.elb.un_healthy_host_count.maximum${module.filter-tags.query_alert} by {region,loadbalancername} ) sum:aws.elb.un_healthy_host_count.maximum${module.filter-tags.query_alert} by {region,loadbalancername} )
) * 100 < 1 ) * 100 < 1
EOQ EOQ
type = "query alert" thresholds = {
thresholds {
critical = 1 critical = 1
warning = 100 warning = 100
} }
evaluation_delay = var.evaluation_delay
new_host_delay = var.new_host_delay
notify_no_data = true notify_no_data = true
evaluation_delay = "${var.evaluation_delay}"
renotify_interval = 0 renotify_interval = 0
notify_audit = false notify_audit = false
timeout_h = 0 timeout_h = 0
include_tags = true include_tags = true
locked = false locked = false
require_full_window = false require_full_window = false
new_host_delay = "${var.new_host_delay}"
tags = ["env:${var.environment}", "type:cloud", "provider:aws", "resource:elb", "team:claranet", "created-by:terraform", "${var.elb_no_healthy_instance_extra_tags}"] tags = concat(["env:${var.environment}", "type:cloud", "provider:aws", "resource:elb", "team:claranet", "created-by:terraform"], var.elb_no_healthy_instance_extra_tags)
} }
resource "datadog_monitor" "ELB_too_much_4xx" { resource "datadog_monitor" "ELB_too_much_4xx" {
count = "${var.elb_4xx_enabled == "true" ? 1 : 0}" count = var.elb_4xx_enabled == "true" ? 1 : 0
name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] ELB 4xx errors too high {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] ELB 4xx errors too high {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
message = "${coalesce(var.elb_4xx_message, var.message)}" message = coalesce(var.elb_4xx_message, var.message)
type = "query alert"
query = <<EOQ query = <<EOQ
sum(${var.elb_4xx_timeframe}): sum(${var.elb_4xx_timeframe}):
default(avg:aws.elb.httpcode_elb_4xx${module.filter-tags.query_alert} by {region,loadbalancername}.as_rate(), 0) / ( default(avg:aws.elb.httpcode_elb_4xx${module.filter-tags.query_alert} by {region,loadbalancername}.as_rate(), 0) / (
default(avg:aws.elb.request_count${module.filter-tags.query_alert} by {region,loadbalancername}.as_rate() + ${var.artificial_requests_count}, 1)) default(avg:aws.elb.request_count${module.filter-tags.query_alert} by {region,loadbalancername}.as_rate() + ${var.artificial_requests_count}, 1))
* 100 > ${var.elb_4xx_threshold_critical} * 100 > ${var.elb_4xx_threshold_critical}
EOQ EOQ
type = "query alert" thresholds = {
warning = var.elb_4xx_threshold_warning
thresholds { critical = var.elb_4xx_threshold_critical
warning = "${var.elb_4xx_threshold_warning}"
critical = "${var.elb_4xx_threshold_critical}"
} }
evaluation_delay = var.evaluation_delay
new_host_delay = var.new_host_delay
notify_no_data = false notify_no_data = false
evaluation_delay = "${var.evaluation_delay}"
renotify_interval = 0 renotify_interval = 0
notify_audit = false notify_audit = false
timeout_h = 0 timeout_h = 0
include_tags = true include_tags = true
locked = false locked = false
require_full_window = false require_full_window = false
new_host_delay = "${var.new_host_delay}"
tags = ["env:${var.environment}", "type:cloud", "provider:aws", "resource:elb", "team:claranet", "created-by:terraform", "${var.elb_4xx_extra_tags}"] tags = concat(["env:${var.environment}", "type:cloud", "provider:aws", "resource:elb", "team:claranet", "created-by:terraform"], var.elb_4xx_extra_tags)
} }
resource "datadog_monitor" "ELB_too_much_5xx" { resource "datadog_monitor" "ELB_too_much_5xx" {
count = "${var.elb_5xx_enabled == "true" ? 1 : 0}" count = var.elb_5xx_enabled == "true" ? 1 : 0
name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] ELB 5xx errors too high {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] ELB 5xx errors too high {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
message = "${coalesce(var.elb_5xx_message, var.message)}" message = coalesce(var.elb_5xx_message, var.message)
type = "query alert"
query = <<EOQ query = <<EOQ
sum(${var.elb_5xx_timeframe}): sum(${var.elb_5xx_timeframe}):
default(avg:aws.elb.httpcode_elb_5xx${module.filter-tags.query_alert} by {region,loadbalancername}.as_rate(), 0) / ( default(avg:aws.elb.httpcode_elb_5xx${module.filter-tags.query_alert} by {region,loadbalancername}.as_rate(), 0) / (
default(avg:aws.elb.request_count${module.filter-tags.query_alert} by {region,loadbalancername}.as_rate() + ${var.artificial_requests_count}, 1)) default(avg:aws.elb.request_count${module.filter-tags.query_alert} by {region,loadbalancername}.as_rate() + ${var.artificial_requests_count}, 1))
* 100 > ${var.elb_5xx_threshold_critical} * 100 > ${var.elb_5xx_threshold_critical}
EOQ EOQ
type = "query alert" thresholds = {
warning = var.elb_5xx_threshold_warning
thresholds { critical = var.elb_5xx_threshold_critical
warning = "${var.elb_5xx_threshold_warning}"
critical = "${var.elb_5xx_threshold_critical}"
} }
evaluation_delay = var.evaluation_delay
new_host_delay = var.new_host_delay
notify_no_data = false notify_no_data = false
evaluation_delay = "${var.evaluation_delay}"
renotify_interval = 0 renotify_interval = 0
notify_audit = false notify_audit = false
timeout_h = 0 timeout_h = 0
include_tags = true include_tags = true
locked = false locked = false
require_full_window = false require_full_window = false
new_host_delay = "${var.new_host_delay}"
tags = ["env:${var.environment}", "type:cloud", "provider:aws", "resource:elb", "team:claranet", "created-by:terraform", "${var.elb_5xx_extra_tags}"] tags = concat(["env:${var.environment}", "type:cloud", "provider:aws", "resource:elb", "team:claranet", "created-by:terraform"], var.elb_5xx_extra_tags)
} }
resource "datadog_monitor" "ELB_too_much_4xx_backend" { resource "datadog_monitor" "ELB_too_much_4xx_backend" {
count = "${var.elb_backend_4xx_enabled == "true" ? 1 : 0}" count = var.elb_backend_4xx_enabled == "true" ? 1 : 0
name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] ELB backend 4xx errors too high {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] ELB backend 4xx errors too high {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
message = "${coalesce(var.elb_backend_4xx_message, var.message)}" message = coalesce(var.elb_backend_4xx_message, var.message)
type = "query alert"
query = <<EOQ query = <<EOQ
sum(${var.elb_backend_4xx_timeframe}): sum(${var.elb_backend_4xx_timeframe}):
default(avg:aws.elb.httpcode_backend_4xx${module.filter-tags.query_alert} by {region,loadbalancername}.as_rate(), 0) / ( default(avg:aws.elb.httpcode_backend_4xx${module.filter-tags.query_alert} by {region,loadbalancername}.as_rate(), 0) / (
default(avg:aws.elb.request_count${module.filter-tags.query_alert} by {region,loadbalancername}.as_rate() + ${var.artificial_requests_count}, 1)) default(avg:aws.elb.request_count${module.filter-tags.query_alert} by {region,loadbalancername}.as_rate() + ${var.artificial_requests_count}, 1))
* 100 > ${var.elb_backend_4xx_threshold_critical} * 100 > ${var.elb_backend_4xx_threshold_critical}
EOQ EOQ
type = "query alert" thresholds = {
warning = var.elb_backend_4xx_threshold_warning
thresholds { critical = var.elb_backend_4xx_threshold_critical
warning = "${var.elb_backend_4xx_threshold_warning}"
critical = "${var.elb_backend_4xx_threshold_critical}"
} }
evaluation_delay = var.evaluation_delay
new_host_delay = var.new_host_delay
notify_no_data = false notify_no_data = false
evaluation_delay = "${var.evaluation_delay}"
renotify_interval = 0 renotify_interval = 0
notify_audit = false notify_audit = false
timeout_h = 0 timeout_h = 0
include_tags = true include_tags = true
locked = false locked = false
require_full_window = false require_full_window = false
new_host_delay = "${var.new_host_delay}"
tags = ["env:${var.environment}", "type:cloud", "provider:aws", "resource:elb", "team:claranet", "created-by:terraform", "${var.elb_backend_4xx_extra_tags}"] tags = concat(["env:${var.environment}", "type:cloud", "provider:aws", "resource:elb", "team:claranet", "created-by:terraform"], var.elb_backend_4xx_extra_tags)
} }
resource "datadog_monitor" "ELB_too_much_5xx_backend" { resource "datadog_monitor" "ELB_too_much_5xx_backend" {
count = "${var.elb_backend_5xx_enabled == "true" ? 1 : 0}" count = var.elb_backend_5xx_enabled == "true" ? 1 : 0
name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] ELB backend 5xx errors too high {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] ELB backend 5xx errors too high {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
message = "${coalesce(var.elb_backend_5xx_message, var.message)}" message = coalesce(var.elb_backend_5xx_message, var.message)
type = "query alert"
query = <<EOQ query = <<EOQ
sum(${var.elb_backend_5xx_timeframe}): sum(${var.elb_backend_5xx_timeframe}):
default(avg:aws.elb.httpcode_backend_5xx${module.filter-tags.query_alert} by {region,loadbalancername}.as_rate(), 0) / ( default(avg:aws.elb.httpcode_backend_5xx${module.filter-tags.query_alert} by {region,loadbalancername}.as_rate(), 0) / (
default(avg:aws.elb.request_count${module.filter-tags.query_alert} by {region,loadbalancername}.as_rate() + ${var.artificial_requests_count}, 1)) default(avg:aws.elb.request_count${module.filter-tags.query_alert} by {region,loadbalancername}.as_rate() + ${var.artificial_requests_count}, 1))
* 100 > ${var.elb_backend_5xx_threshold_critical} * 100 > ${var.elb_backend_5xx_threshold_critical}
EOQ EOQ
type = "query alert" thresholds = {
warning = var.elb_backend_5xx_threshold_warning
thresholds { critical = var.elb_backend_5xx_threshold_critical
warning = "${var.elb_backend_5xx_threshold_warning}"
critical = "${var.elb_backend_5xx_threshold_critical}"
} }
evaluation_delay = var.evaluation_delay
new_host_delay = var.new_host_delay
notify_no_data = false notify_no_data = false
evaluation_delay = "${var.evaluation_delay}"
renotify_interval = 0 renotify_interval = 0
notify_audit = false notify_audit = false
timeout_h = 0 timeout_h = 0
include_tags = true include_tags = true
locked = false locked = false
require_full_window = false require_full_window = false
new_host_delay = "${var.new_host_delay}"
tags = ["env:${var.environment}", "type:cloud", "provider:aws", "resource:elb", "team:claranet", "created-by:terraform", "${var.elb_backend_5xx_extra_tags}"] tags = concat(["env:${var.environment}", "type:cloud", "provider:aws", "resource:elb", "team:claranet", "created-by:terraform"], var.elb_backend_5xx_extra_tags)
} }
resource "datadog_monitor" "ELB_backend_latency" { resource "datadog_monitor" "ELB_backend_latency" {
count = "${var.elb_backend_latency_enabled == "true" ? 1 : 0}" count = var.elb_backend_latency_enabled == "true" ? 1 : 0
name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] ELB latency too high {{#is_alert}}{{{comparator}}} {{threshold}}s ({{value}}s){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}s ({{value}}s){{/is_warning}}" name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] ELB latency too high {{#is_alert}}{{{comparator}}} {{threshold}}s ({{value}}s){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}s ({{value}}s){{/is_warning}}"
message = "${coalesce(var.elb_backend_latency_message, var.message)}" message = coalesce(var.elb_backend_latency_message, var.message)
type = "query alert"
query = <<EOQ query = <<EOQ
${var.elb_backend_latency_time_aggregator}(${var.elb_backend_latency_timeframe}): ${var.elb_backend_latency_time_aggregator}(${var.elb_backend_latency_timeframe}):
default(avg:aws.elb.latency${module.filter-tags.query_alert} by {region,loadbalancername}, 0) default(avg:aws.elb.latency${module.filter-tags.query_alert} by {region,loadbalancername}, 0)
> ${var.elb_backend_latency_critical} > ${var.elb_backend_latency_critical}
EOQ EOQ
type = "query alert" thresholds = {
warning = var.elb_backend_latency_warning
thresholds { critical = var.elb_backend_latency_critical
warning = "${var.elb_backend_latency_warning}"
critical = "${var.elb_backend_latency_critical}"
} }
evaluation_delay = var.evaluation_delay
new_host_delay = var.new_host_delay
notify_no_data = false notify_no_data = false
evaluation_delay = "${var.evaluation_delay}"
renotify_interval = 0 renotify_interval = 0
notify_audit = false notify_audit = false
timeout_h = 0 timeout_h = 0
include_tags = true include_tags = true
locked = false locked = false
require_full_window = false require_full_window = false
new_host_delay = "${var.new_host_delay}"
tags = ["env:${var.environment}", "type:cloud", "provider:aws", "resource:elb", "team:claranet", "created-by:terraform", "${var.elb_backend_latency_extra_tags}"] tags = concat(["env:${var.environment}", "type:cloud", "provider:aws", "resource:elb", "team:claranet", "created-by:terraform"], var.elb_backend_latency_extra_tags)
} }

View File

@ -1,29 +1,30 @@
output "ELB_no_healthy_instances_id" { output "ELB_no_healthy_instances_id" {
description = "id for monitor ELB_no_healthy_instances" description = "id for monitor ELB_no_healthy_instances"
value = "${datadog_monitor.ELB_no_healthy_instances.*.id}" value = datadog_monitor.ELB_no_healthy_instances.*.id
} }
output "ELB_too_much_4xx_id" { output "ELB_too_much_4xx_id" {
description = "id for monitor ELB_too_much_4xx" description = "id for monitor ELB_too_much_4xx"
value = "${datadog_monitor.ELB_too_much_4xx.*.id}" value = datadog_monitor.ELB_too_much_4xx.*.id
} }
output "ELB_too_much_5xx_id" { output "ELB_too_much_5xx_id" {
description = "id for monitor ELB_too_much_5xx" description = "id for monitor ELB_too_much_5xx"
value = "${datadog_monitor.ELB_too_much_5xx.*.id}" value = datadog_monitor.ELB_too_much_5xx.*.id
} }
output "ELB_too_much_4xx_backend_id" { output "ELB_too_much_4xx_backend_id" {
description = "id for monitor ELB_too_much_4xx_backend" description = "id for monitor ELB_too_much_4xx_backend"
value = "${datadog_monitor.ELB_too_much_4xx_backend.*.id}" value = datadog_monitor.ELB_too_much_4xx_backend.*.id
} }
output "ELB_too_much_5xx_backend_id" { output "ELB_too_much_5xx_backend_id" {
description = "id for monitor ELB_too_much_5xx_backend" description = "id for monitor ELB_too_much_5xx_backend"
value = "${datadog_monitor.ELB_too_much_5xx_backend.*.id}" value = datadog_monitor.ELB_too_much_5xx_backend.*.id
} }
output "ELB_backend_latency_id" { output "ELB_backend_latency_id" {
description = "id for monitor ELB_backend_latency" description = "id for monitor ELB_backend_latency"
value = "${datadog_monitor.ELB_backend_latency.*.id}" value = datadog_monitor.ELB_backend_latency.*.id
} }

View File

@ -6,8 +6,8 @@
module "datadog-monitors-cloud-aws-kinesis-firehose" { module "datadog-monitors-cloud-aws-kinesis-firehose" {
source = "git::ssh://git@git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors.git//cloud/aws/kinesis-firehose?ref={revision}" source = "git::ssh://git@git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors.git//cloud/aws/kinesis-firehose?ref={revision}"
environment = "${var.environment}" environment = var.environment
message = "${module.datadog-message-alerting.alerting-message}" message = module.datadog-message-alerting.alerting-message
} }
``` ```
@ -28,7 +28,7 @@ Creates DataDog monitors with the following checks:
| filter\_tags\_custom\_excluded | Tags excluded for custom filtering when filter_tags_use_defaults is false | string | `""` | no | | filter\_tags\_custom\_excluded | Tags excluded for custom filtering when filter_tags_use_defaults is false | string | `""` | no |
| filter\_tags\_use\_defaults | Use default filter tags convention | string | `"true"` | no | | filter\_tags\_use\_defaults | Use default filter tags convention | string | `"true"` | no |
| incoming\_records\_enabled | Flag to enable Kinesis Firehorse incoming records monitor | string | `"true"` | no | | incoming\_records\_enabled | Flag to enable Kinesis Firehorse incoming records monitor | string | `"true"` | no |
| incoming\_records\_extra\_tags | Extra tags for Kinesis Firehorse incoming records monitor | list | `[]` | no | | incoming\_records\_extra\_tags | Extra tags for Kinesis Firehorse incoming records monitor | list(string) | `[]` | no |
| incoming\_records\_message | Custom message for Kinesis Firehorse incoming records monitor | string | `""` | no | | incoming\_records\_message | Custom message for Kinesis Firehorse incoming records monitor | string | `""` | no |
| incoming\_records\_timeframe | Monitor timeframe for incoming records metrics evaluation [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_15m"` | no | | incoming\_records\_timeframe | Monitor timeframe for incoming records metrics evaluation [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_15m"` | no |
| message | Message sent when an alert is triggered | string | n/a | yes | | message | Message sent when an alert is triggered | string | n/a | yes |

View File

@ -1,7 +1,7 @@
# Global Terraform # Global Terraform
variable "environment" { variable "environment" {
description = "Environment" description = "Environment"
type = "string" type = string
} }
# Global DataDog # Global DataDog
@ -43,19 +43,19 @@ variable "filter_tags_custom_excluded" {
variable "incoming_records_enabled" { variable "incoming_records_enabled" {
description = "Flag to enable Kinesis Firehorse incoming records monitor" description = "Flag to enable Kinesis Firehorse incoming records monitor"
type = "string" type = string
default = "true" default = "true"
} }
variable "incoming_records_extra_tags" { variable "incoming_records_extra_tags" {
description = "Extra tags for Kinesis Firehorse incoming records monitor" description = "Extra tags for Kinesis Firehorse incoming records monitor"
type = "list" type = list(string)
default = [] default = []
} }
variable "incoming_records_message" { variable "incoming_records_message" {
description = "Custom message for Kinesis Firehorse incoming records monitor" description = "Custom message for Kinesis Firehorse incoming records monitor"
type = "string" type = string
default = "" default = ""
} }
@ -63,3 +63,4 @@ variable "incoming_records_timeframe" {
description = "Monitor timeframe for incoming records metrics evaluation [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" description = "Monitor timeframe for incoming records metrics evaluation [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
default = "last_15m" default = "last_15m"
} }

View File

@ -1,9 +1,10 @@
module "filter-tags" { module "filter-tags" {
source = "../../../common/filter-tags" source = "../../../common/filter-tags"
environment = "${var.environment}" environment = var.environment
resource = "aws_kinesis-firehose" resource = "aws_kinesis-firehose"
filter_tags_use_defaults = "${var.filter_tags_use_defaults}" filter_tags_use_defaults = var.filter_tags_use_defaults
filter_tags_custom = "${var.filter_tags_custom}" filter_tags_custom = var.filter_tags_custom
filter_tags_custom_excluded = "${var.filter_tags_custom_excluded}" filter_tags_custom_excluded = var.filter_tags_custom_excluded
} }

View File

@ -1,30 +1,30 @@
### Kinesis Firehose Incoming records ### ### Kinesis Firehose Incoming records ###
resource "datadog_monitor" "firehose_incoming_records" { resource "datadog_monitor" "firehose_incoming_records" {
count = "${var.incoming_records_enabled == "true" ? 1 : 0}" count = var.incoming_records_enabled == "true" ? 1 : 0
name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Kinesis Firehose No incoming records" name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Kinesis Firehose No incoming records"
message = "${coalesce(var.incoming_records_message, var.message)}" message = coalesce(var.incoming_records_message, var.message)
type = "query alert"
type = "query alert"
query = <<EOQ query = <<EOQ
sum(${var.incoming_records_timeframe}): ( sum(${var.incoming_records_timeframe}): (
avg:aws.firehose.incoming_records${module.filter-tags.query_alert} by {region,deliverystreamname} avg:aws.firehose.incoming_records${module.filter-tags.query_alert} by {region,deliverystreamname}
) <= 0 ) <= 0
EOQ EOQ
thresholds { thresholds = {
critical = 0 critical = 0
} }
evaluation_delay = var.evaluation_delay
new_host_delay = var.new_host_delay
notify_no_data = true notify_no_data = true
evaluation_delay = "${var.evaluation_delay}"
renotify_interval = 0 renotify_interval = 0
notify_audit = false notify_audit = false
timeout_h = 0 timeout_h = 0
include_tags = true include_tags = true
locked = false locked = false
require_full_window = false require_full_window = false
new_host_delay = "${var.new_host_delay}"
tags = ["env:${var.environment}", "type:cloud", "provider:aws", "resource:kinesis-firehose", "team:claranet", "created-by:terraform", "${var.incoming_records_extra_tags}"] tags = concat(["env:${var.environment}", "type:cloud", "provider:aws", "resource:kinesis-firehose", "team:claranet", "created-by:terraform"], var.incoming_records_extra_tags)
} }

View File

@ -1,4 +1,5 @@
output "firehose_incoming_records_id" { output "firehose_incoming_records_id" {
description = "id for monitor firehose_incoming_records" description = "id for monitor firehose_incoming_records"
value = "${datadog_monitor.firehose_incoming_records.*.id}" value = datadog_monitor.firehose_incoming_records.*.id
} }

View File

@ -6,8 +6,8 @@
module "datadog-monitors-cloud-aws-rds-aurora-mysql" { module "datadog-monitors-cloud-aws-rds-aurora-mysql" {
source = "git::ssh://git@git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors.git//cloud/aws/rds/aurora/mysql?ref={revision}" source = "git::ssh://git@git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors.git//cloud/aws/rds/aurora/mysql?ref={revision}"
environment = "${var.environment}" environment = var.environment
message = "${module.datadog-message-alerting.alerting-message}" message = module.datadog-message-alerting.alerting-message
} }
``` ```
@ -23,7 +23,7 @@ Creates DataDog monitors with the following checks:
| Name | Description | Type | Default | Required | | Name | Description | Type | Default | Required |
|------|-------------|:----:|:-----:|:-----:| |------|-------------|:----:|:-----:|:-----:|
| aurora\_replicalag\_enabled | Flag to enable RDS Aurora replica lag monitor | string | `"true"` | no | | aurora\_replicalag\_enabled | Flag to enable RDS Aurora replica lag monitor | string | `"true"` | no |
| aurora\_replicalag\_extra\_tags | Extra tags for RDS Aurora replica lag monitor | list | `[]` | no | | aurora\_replicalag\_extra\_tags | Extra tags for RDS Aurora replica lag monitor | list(string) | `[]` | no |
| aurora\_replicalag\_message | Custom message for RDS Aurora replica lag monitor | string | `""` | no | | aurora\_replicalag\_message | Custom message for RDS Aurora replica lag monitor | string | `""` | no |
| aurora\_replicalag\_threshold\_critical | Aurora replica lag in milliseconds (critical threshold) | string | `"200"` | no | | aurora\_replicalag\_threshold\_critical | Aurora replica lag in milliseconds (critical threshold) | string | `"200"` | no |
| aurora\_replicalag\_threshold\_warning | Aurora replica lag in milliseconds (warning threshold) | string | `"100"` | no | | aurora\_replicalag\_threshold\_warning | Aurora replica lag in milliseconds (warning threshold) | string | `"100"` | no |

View File

@ -1,7 +1,7 @@
# Global Terraform # Global Terraform
variable "environment" { variable "environment" {
description = "Architecture Environment" description = "Architecture Environment"
type = "string" type = string
} }
# Global DataDog # Global DataDog
@ -43,25 +43,25 @@ variable "filter_tags_custom_excluded" {
variable "aurora_replicalag_enabled" { variable "aurora_replicalag_enabled" {
description = "Flag to enable RDS Aurora replica lag monitor" description = "Flag to enable RDS Aurora replica lag monitor"
type = "string" type = string
default = "true" default = "true"
} }
variable "aurora_replicalag_extra_tags" { variable "aurora_replicalag_extra_tags" {
description = "Extra tags for RDS Aurora replica lag monitor" description = "Extra tags for RDS Aurora replica lag monitor"
type = "list" type = list(string)
default = [] default = []
} }
variable "aurora_replicalag_message" { variable "aurora_replicalag_message" {
description = "Custom message for RDS Aurora replica lag monitor" description = "Custom message for RDS Aurora replica lag monitor"
type = "string" type = string
default = "" default = ""
} }
variable "aurora_replicalag_timeframe" { variable "aurora_replicalag_timeframe" {
description = "Monitor timeframe for RDS Aurora replica lag monitor [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" description = "Monitor timeframe for RDS Aurora replica lag monitor [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string" type = string
default = "last_5m" default = "last_5m"
} }
@ -74,3 +74,4 @@ variable "aurora_replicalag_threshold_critical" {
description = "Aurora replica lag in milliseconds (critical threshold)" description = "Aurora replica lag in milliseconds (critical threshold)"
default = "200" default = "200"
} }

View File

@ -1,9 +1,10 @@
module "filter-tags" { module "filter-tags" {
source = "../../../../../common/filter-tags" source = "../../../../../common/filter-tags"
environment = "${var.environment}" environment = var.environment
resource = "aws_rds" resource = "aws_rds"
filter_tags_use_defaults = "${var.filter_tags_use_defaults}" filter_tags_use_defaults = var.filter_tags_use_defaults
filter_tags_custom = "${var.filter_tags_custom}" filter_tags_custom = var.filter_tags_custom
filter_tags_custom_excluded = "${var.filter_tags_custom_excluded}" filter_tags_custom_excluded = var.filter_tags_custom_excluded
} }

View File

@ -1,30 +1,30 @@
### RDS Aurora Mysql Replica Lag monitor ### ### RDS Aurora Mysql Replica Lag monitor ###
resource "datadog_monitor" "rds_aurora_mysql_replica_lag" { resource "datadog_monitor" "rds_aurora_mysql_replica_lag" {
count = "${var.aurora_replicalag_enabled == "true" ? 1 : 0}" count = var.aurora_replicalag_enabled == "true" ? 1 : 0
name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] RDS Aurora Mysql replica lag {{#is_alert}}{{{comparator}}} {{threshold}} ms ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ms ({{value}}%){{/is_warning}}" name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] RDS Aurora Mysql replica lag {{#is_alert}}{{{comparator}}} {{threshold}} ms ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ms ({{value}}%){{/is_warning}}"
message = "${coalesce(var.aurora_replicalag_message, var.message)}" message = coalesce(var.aurora_replicalag_message, var.message)
type = "query alert"
type = "query alert"
query = <<EOQ query = <<EOQ
avg(${var.aurora_replicalag_timeframe}): ( avg(${var.aurora_replicalag_timeframe}): (
avg:aws.rds.aurora_replica_lag${module.filter-tags.query_alert} by {region,name} avg:aws.rds.aurora_replica_lag${module.filter-tags.query_alert} by {region,name}
) > ${var.aurora_replicalag_threshold_critical} ) > ${var.aurora_replicalag_threshold_critical}
EOQ EOQ
thresholds { thresholds = {
warning = "${var.aurora_replicalag_threshold_warning}" warning = var.aurora_replicalag_threshold_warning
critical = "${var.aurora_replicalag_threshold_critical}" critical = var.aurora_replicalag_threshold_critical
} }
evaluation_delay = var.evaluation_delay
new_host_delay = var.new_host_delay
notify_no_data = true notify_no_data = true
evaluation_delay = "${var.evaluation_delay}"
notify_audit = false notify_audit = false
timeout_h = 0 timeout_h = 0
include_tags = true include_tags = true
locked = false locked = false
require_full_window = false require_full_window = false
new_host_delay = "${var.new_host_delay}"
tags = ["env:${var.environment}", "type:cloud", "provider:aws", "resource:rds-aurora-mysql", "team:claranet", "created-by:terraform", "${var.aurora_replicalag_extra_tags}"] tags = concat(["env:${var.environment}", "type:cloud", "provider:aws", "resource:rds-aurora-mysql", "team:claranet", "created-by:terraform"], var.aurora_replicalag_extra_tags)
} }

View File

@ -1,4 +1,5 @@
output "rds_aurora_mysql_replica_lag_id" { output "rds_aurora_mysql_replica_lag_id" {
description = "id for monitor rds_aurora_mysql_replica_lag" description = "id for monitor rds_aurora_mysql_replica_lag"
value = "${datadog_monitor.rds_aurora_mysql_replica_lag.*.id}" value = datadog_monitor.rds_aurora_mysql_replica_lag.*.id
} }

View File

@ -6,8 +6,8 @@
module "datadog-monitors-cloud-aws-rds-aurora-postgresql" { module "datadog-monitors-cloud-aws-rds-aurora-postgresql" {
source = "git::ssh://git@git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors.git//cloud/aws/rds/aurora/postgresql?ref={revision}" source = "git::ssh://git@git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors.git//cloud/aws/rds/aurora/postgresql?ref={revision}"
environment = "${var.environment}" environment = var.environment
message = "${module.datadog-message-alerting.alerting-message}" message = module.datadog-message-alerting.alerting-message
} }
``` ```
@ -23,7 +23,7 @@ Creates DataDog monitors with the following checks:
| Name | Description | Type | Default | Required | | Name | Description | Type | Default | Required |
|------|-------------|:----:|:-----:|:-----:| |------|-------------|:----:|:-----:|:-----:|
| aurora\_replicalag\_enabled | Flag to enable RDS Aurora replica lag monitor | string | `"true"` | no | | aurora\_replicalag\_enabled | Flag to enable RDS Aurora replica lag monitor | string | `"true"` | no |
| aurora\_replicalag\_extra\_tags | Extra tags for RDS Aurora replica lag monitor | list | `[]` | no | | aurora\_replicalag\_extra\_tags | Extra tags for RDS Aurora replica lag monitor | list(string) | `[]` | no |
| aurora\_replicalag\_message | Custom message for RDS Aurora replica lag monitor | string | `""` | no | | aurora\_replicalag\_message | Custom message for RDS Aurora replica lag monitor | string | `""` | no |
| aurora\_replicalag\_threshold\_critical | Aurora replica lag in milliseconds (critical threshold) | string | `"200"` | no | | aurora\_replicalag\_threshold\_critical | Aurora replica lag in milliseconds (critical threshold) | string | `"200"` | no |
| aurora\_replicalag\_threshold\_warning | Aurora replica lag in milliseconds (warning threshold) | string | `"100"` | no | | aurora\_replicalag\_threshold\_warning | Aurora replica lag in milliseconds (warning threshold) | string | `"100"` | no |

View File

@ -1,7 +1,7 @@
# Global Terraform # Global Terraform
variable "environment" { variable "environment" {
description = "Architecture Environment" description = "Architecture Environment"
type = "string" type = string
} }
# Global DataDog # Global DataDog
@ -43,25 +43,25 @@ variable "filter_tags_custom_excluded" {
variable "aurora_replicalag_enabled" { variable "aurora_replicalag_enabled" {
description = "Flag to enable RDS Aurora replica lag monitor" description = "Flag to enable RDS Aurora replica lag monitor"
type = "string" type = string
default = "true" default = "true"
} }
variable "aurora_replicalag_extra_tags" { variable "aurora_replicalag_extra_tags" {
description = "Extra tags for RDS Aurora replica lag monitor" description = "Extra tags for RDS Aurora replica lag monitor"
type = "list" type = list(string)
default = [] default = []
} }
variable "aurora_replicalag_message" { variable "aurora_replicalag_message" {
description = "Custom message for RDS Aurora replica lag monitor" description = "Custom message for RDS Aurora replica lag monitor"
type = "string" type = string
default = "" default = ""
} }
variable "aurora_replicalag_timeframe" { variable "aurora_replicalag_timeframe" {
description = "Monitor timeframe for RDS Aurora replica lag monitor [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" description = "Monitor timeframe for RDS Aurora replica lag monitor [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string" type = string
default = "last_5m" default = "last_5m"
} }
@ -74,3 +74,4 @@ variable "aurora_replicalag_threshold_critical" {
description = "Aurora replica lag in milliseconds (critical threshold)" description = "Aurora replica lag in milliseconds (critical threshold)"
default = "200" default = "200"
} }

View File

@ -1,9 +1,10 @@
module "filter-tags" { module "filter-tags" {
source = "../../../../../common/filter-tags" source = "../../../../../common/filter-tags"
environment = "${var.environment}" environment = var.environment
resource = "aws_rds" resource = "aws_rds"
filter_tags_use_defaults = "${var.filter_tags_use_defaults}" filter_tags_use_defaults = var.filter_tags_use_defaults
filter_tags_custom = "${var.filter_tags_custom}" filter_tags_custom = var.filter_tags_custom
filter_tags_custom_excluded = "${var.filter_tags_custom_excluded}" filter_tags_custom_excluded = var.filter_tags_custom_excluded
} }

View File

@ -1,30 +1,30 @@
### RDS Aurora Postgresql Replica Lag monitor ### ### RDS Aurora Postgresql Replica Lag monitor ###
resource "datadog_monitor" "rds_aurora_postgresql_replica_lag" { resource "datadog_monitor" "rds_aurora_postgresql_replica_lag" {
count = "${var.aurora_replicalag_enabled == "true" ? 1 : 0}" count = var.aurora_replicalag_enabled == "true" ? 1 : 0
name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] RDS Aurora PostgreSQL replica lag {{#is_alert}}{{{comparator}}} {{threshold}} ms ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ms ({{value}}%){{/is_warning}}" name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] RDS Aurora PostgreSQL replica lag {{#is_alert}}{{{comparator}}} {{threshold}} ms ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ms ({{value}}%){{/is_warning}}"
message = "${coalesce(var.aurora_replicalag_message, var.message)}" message = coalesce(var.aurora_replicalag_message, var.message)
type = "query alert"
type = "query alert"
query = <<EOQ query = <<EOQ
avg(${var.aurora_replicalag_timeframe}): ( avg(${var.aurora_replicalag_timeframe}): (
avg:aws.rds.rdsto_aurora_postgre_sqlreplica_lag${module.filter-tags.query_alert} by {region,name} avg:aws.rds.rdsto_aurora_postgre_sqlreplica_lag${module.filter-tags.query_alert} by {region,name}
) > ${var.aurora_replicalag_threshold_critical} ) > ${var.aurora_replicalag_threshold_critical}
EOQ EOQ
thresholds { thresholds = {
warning = "${var.aurora_replicalag_threshold_warning}" warning = var.aurora_replicalag_threshold_warning
critical = "${var.aurora_replicalag_threshold_critical}" critical = var.aurora_replicalag_threshold_critical
} }
evaluation_delay = var.evaluation_delay
new_host_delay = var.new_host_delay
notify_no_data = true notify_no_data = true
evaluation_delay = "${var.evaluation_delay}"
notify_audit = false notify_audit = false
timeout_h = 0 timeout_h = 0
include_tags = true include_tags = true
locked = false locked = false
require_full_window = false require_full_window = false
new_host_delay = "${var.new_host_delay}"
tags = ["env:${var.environment}", "type:cloud", "provider:aws", "resource:rds-aurora-postgresql", "team:claranet", "created-by:terraform", "${var.aurora_replicalag_extra_tags}"] tags = concat(["env:${var.environment}", "type:cloud", "provider:aws", "resource:rds-aurora-postgresql", "team:claranet", "created-by:terraform"], var.aurora_replicalag_extra_tags)
} }

View File

@ -1,4 +1,5 @@
output "rds_aurora_postgresql_replica_lag_id" { output "rds_aurora_postgresql_replica_lag_id" {
description = "id for monitor rds_aurora_postgresql_replica_lag" description = "id for monitor rds_aurora_postgresql_replica_lag"
value = "${datadog_monitor.rds_aurora_postgresql_replica_lag.*.id}" value = datadog_monitor.rds_aurora_postgresql_replica_lag.*.id
} }

View File

@ -6,8 +6,8 @@
module "datadog-monitors-cloud-aws-rds-common" { module "datadog-monitors-cloud-aws-rds-common" {
source = "git::ssh://git@git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors.git//cloud/aws/rds/common?ref={revision}" source = "git::ssh://git@git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors.git//cloud/aws/rds/common?ref={revision}"
environment = "${var.environment}" environment = var.environment
message = "${module.datadog-message-alerting.alerting-message}" message = module.datadog-message-alerting.alerting-message
} }
``` ```
@ -25,14 +25,14 @@ Creates DataDog monitors with the following checks:
| Name | Description | Type | Default | Required | | Name | Description | Type | Default | Required |
|------|-------------|:----:|:-----:|:-----:| |------|-------------|:----:|:-----:|:-----:|
| cpu\_enabled | Flag to enable RDS CPU usage monitor | string | `"true"` | no | | cpu\_enabled | Flag to enable RDS CPU usage monitor | string | `"true"` | no |
| cpu\_extra\_tags | Extra tags for RDS CPU usage monitor | list | `[]` | no | | cpu\_extra\_tags | Extra tags for RDS CPU usage monitor | list(string) | `[]` | no |
| cpu\_message | Custom message for RDS CPU usage monitor | string | `""` | no | | cpu\_message | Custom message for RDS CPU usage monitor | string | `""` | no |
| cpu\_threshold\_critical | CPU usage in percent (critical threshold) | string | `"90"` | no | | cpu\_threshold\_critical | CPU usage in percent (critical threshold) | string | `"90"` | no |
| cpu\_threshold\_warning | CPU usage in percent (warning threshold) | string | `"80"` | no | | cpu\_threshold\_warning | CPU usage in percent (warning threshold) | string | `"80"` | no |
| cpu\_time\_aggregator | Monitor aggregator for RDS CPU usage [available values: min, max or avg] | string | `"min"` | no | | cpu\_time\_aggregator | Monitor aggregator for RDS CPU usage [available values: min, max or avg] | string | `"min"` | no |
| cpu\_timeframe | Monitor timeframe for RDS CPU usage [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_15m"` | no | | cpu\_timeframe | Monitor timeframe for RDS CPU usage [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_15m"` | no |
| diskspace\_enabled | Flag to enable RDS free diskspace monitor | string | `"true"` | no | | diskspace\_enabled | Flag to enable RDS free diskspace monitor | string | `"true"` | no |
| diskspace\_extra\_tags | Extra tags for RDS free diskspace monitor | list | `[]` | no | | diskspace\_extra\_tags | Extra tags for RDS free diskspace monitor | list(string) | `[]` | no |
| diskspace\_message | Custom message for RDS free diskspace monitor | string | `""` | no | | diskspace\_message | Custom message for RDS free diskspace monitor | string | `""` | no |
| diskspace\_threshold\_critical | Disk free space in percent (critical threshold) | string | `"10"` | no | | diskspace\_threshold\_critical | Disk free space in percent (critical threshold) | string | `"10"` | no |
| diskspace\_threshold\_warning | Disk free space in percent (warning threshold) | string | `"20"` | no | | diskspace\_threshold\_warning | Disk free space in percent (warning threshold) | string | `"20"` | no |
@ -47,7 +47,7 @@ Creates DataDog monitors with the following checks:
| new\_host\_delay | Delay in seconds before monitor new resource | string | `"300"` | no | | new\_host\_delay | Delay in seconds before monitor new resource | string | `"300"` | no |
| prefix\_slug | Prefix string to prepend between brackets on every monitors names | string | `""` | no | | prefix\_slug | Prefix string to prepend between brackets on every monitors names | string | `""` | no |
| replicalag\_enabled | Flag to enable RDS replica lag monitor | string | `"true"` | no | | replicalag\_enabled | Flag to enable RDS replica lag monitor | string | `"true"` | no |
| replicalag\_extra\_tags | Extra tags for RDS replica lag monitor | list | `[]` | no | | replicalag\_extra\_tags | Extra tags for RDS replica lag monitor | list(string) | `[]` | no |
| replicalag\_message | Custom message for RDS replica lag monitor | string | `""` | no | | replicalag\_message | Custom message for RDS replica lag monitor | string | `""` | no |
| replicalag\_threshold\_critical | replica lag in seconds (critical threshold) | string | `"300"` | no | | replicalag\_threshold\_critical | replica lag in seconds (critical threshold) | string | `"300"` | no |
| replicalag\_threshold\_warning | replica lag in seconds (warning threshold) | string | `"200"` | no | | replicalag\_threshold\_warning | replica lag in seconds (warning threshold) | string | `"200"` | no |

View File

@ -1,7 +1,7 @@
# Global Terraform # Global Terraform
variable "environment" { variable "environment" {
description = "Architecture Environment" description = "Architecture Environment"
type = "string" type = string
} }
# Global DataDog # Global DataDog
@ -43,31 +43,31 @@ variable "filter_tags_custom_excluded" {
variable "cpu_enabled" { variable "cpu_enabled" {
description = "Flag to enable RDS CPU usage monitor" description = "Flag to enable RDS CPU usage monitor"
type = "string" type = string
default = "true" default = "true"
} }
variable "cpu_extra_tags" { variable "cpu_extra_tags" {
description = "Extra tags for RDS CPU usage monitor" description = "Extra tags for RDS CPU usage monitor"
type = "list" type = list(string)
default = [] default = []
} }
variable "cpu_message" { variable "cpu_message" {
description = "Custom message for RDS CPU usage monitor" description = "Custom message for RDS CPU usage monitor"
type = "string" type = string
default = "" default = ""
} }
variable "cpu_time_aggregator" { variable "cpu_time_aggregator" {
description = "Monitor aggregator for RDS CPU usage [available values: min, max or avg]" description = "Monitor aggregator for RDS CPU usage [available values: min, max or avg]"
type = "string" type = string
default = "min" default = "min"
} }
variable "cpu_timeframe" { variable "cpu_timeframe" {
description = "Monitor timeframe for RDS CPU usage [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" description = "Monitor timeframe for RDS CPU usage [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string" type = string
default = "last_15m" default = "last_15m"
} }
@ -83,31 +83,31 @@ variable "cpu_threshold_critical" {
variable "diskspace_enabled" { variable "diskspace_enabled" {
description = "Flag to enable RDS free diskspace monitor" description = "Flag to enable RDS free diskspace monitor"
type = "string" type = string
default = "true" default = "true"
} }
variable "diskspace_extra_tags" { variable "diskspace_extra_tags" {
description = "Extra tags for RDS free diskspace monitor" description = "Extra tags for RDS free diskspace monitor"
type = "list" type = list(string)
default = [] default = []
} }
variable "diskspace_message" { variable "diskspace_message" {
description = "Custom message for RDS free diskspace monitor" description = "Custom message for RDS free diskspace monitor"
type = "string" type = string
default = "" default = ""
} }
variable "diskspace_time_aggregator" { variable "diskspace_time_aggregator" {
description = "Monitor aggregator for RDS free diskspace [available values: min, max or avg]" description = "Monitor aggregator for RDS free diskspace [available values: min, max or avg]"
type = "string" type = string
default = "min" default = "min"
} }
variable "diskspace_timeframe" { variable "diskspace_timeframe" {
description = "Monitor timeframe for RDS free diskspace [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" description = "Monitor timeframe for RDS free diskspace [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string" type = string
default = "last_15m" default = "last_15m"
} }
@ -123,25 +123,25 @@ variable "diskspace_threshold_critical" {
variable "replicalag_enabled" { variable "replicalag_enabled" {
description = "Flag to enable RDS replica lag monitor" description = "Flag to enable RDS replica lag monitor"
type = "string" type = string
default = "true" default = "true"
} }
variable "replicalag_extra_tags" { variable "replicalag_extra_tags" {
description = "Extra tags for RDS replica lag monitor" description = "Extra tags for RDS replica lag monitor"
type = "list" type = list(string)
default = [] default = []
} }
variable "replicalag_message" { variable "replicalag_message" {
description = "Custom message for RDS replica lag monitor" description = "Custom message for RDS replica lag monitor"
type = "string" type = string
default = "" default = ""
} }
variable "replicalag_timeframe" { variable "replicalag_timeframe" {
description = "Monitor timeframe for RDS replica lag monitor [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" description = "Monitor timeframe for RDS replica lag monitor [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string" type = string
default = "last_5m" default = "last_5m"
} }
@ -154,3 +154,4 @@ variable "replicalag_threshold_critical" {
description = "replica lag in seconds (critical threshold)" description = "replica lag in seconds (critical threshold)"
default = "300" default = "300"
} }

View File

@ -1,9 +1,10 @@
module "filter-tags" { module "filter-tags" {
source = "../../../../common/filter-tags" source = "../../../../common/filter-tags"
environment = "${var.environment}" environment = var.environment
resource = "aws_rds" resource = "aws_rds"
filter_tags_use_defaults = "${var.filter_tags_use_defaults}" filter_tags_use_defaults = var.filter_tags_use_defaults
filter_tags_custom = "${var.filter_tags_custom}" filter_tags_custom = var.filter_tags_custom
filter_tags_custom_excluded = "${var.filter_tags_custom_excluded}" filter_tags_custom_excluded = var.filter_tags_custom_excluded
} }

View File

@ -1,93 +1,91 @@
### RDS instance CPU monitor ### ### RDS instance CPU monitor ###
resource "datadog_monitor" "rds_cpu_90_15min" { resource "datadog_monitor" "rds_cpu_90_15min" {
count = "${var.cpu_enabled == "true" ? 1 : 0}" count = var.cpu_enabled == "true" ? 1 : 0
name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] RDS instance CPU high {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] RDS instance CPU high {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
message = "${coalesce(var.cpu_message, var.message)}" message = coalesce(var.cpu_message, var.message)
type = "query alert"
type = "query alert"
query = <<EOQ query = <<EOQ
${var.cpu_time_aggregator}(${var.cpu_timeframe}): ( ${var.cpu_time_aggregator}(${var.cpu_timeframe}): (
avg:aws.rds.cpuutilization${module.filter-tags.query_alert} by {region,name} avg:aws.rds.cpuutilization${module.filter-tags.query_alert} by {region,name}
) > ${var.cpu_threshold_critical} ) > ${var.cpu_threshold_critical}
EOQ EOQ
thresholds { thresholds = {
warning = "${var.cpu_threshold_warning}" warning = var.cpu_threshold_warning
critical = "${var.cpu_threshold_critical}" critical = var.cpu_threshold_critical
} }
evaluation_delay = var.evaluation_delay
new_host_delay = var.new_host_delay
notify_no_data = false notify_no_data = false
evaluation_delay = "${var.evaluation_delay}"
notify_audit = false notify_audit = false
timeout_h = 0 timeout_h = 0
include_tags = true include_tags = true
locked = false locked = false
require_full_window = false require_full_window = false
new_host_delay = "${var.new_host_delay}"
tags = ["env:${var.environment}", "type:cloud", "provider:aws", "resource:rds", "team:claranet", "created-by:terraform", "${var.cpu_extra_tags}"] tags = concat(["env:${var.environment}", "type:cloud", "provider:aws", "resource:rds", "team:claranet", "created-by:terraform"], var.cpu_extra_tags)
} }
### RDS instance free space monitor ### ### RDS instance free space monitor ###
resource "datadog_monitor" "rds_free_space_low" { resource "datadog_monitor" "rds_free_space_low" {
count = "${var.diskspace_enabled == "true" ? 1 : 0}" count = var.diskspace_enabled == "true" ? 1 : 0
name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] RDS instance free space {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] RDS instance free space {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
message = "${coalesce(var.diskspace_message, var.message)}" message = coalesce(var.diskspace_message, var.message)
type = "query alert"
type = "query alert"
query = <<EOQ query = <<EOQ
${var.diskspace_time_aggregator}(${var.diskspace_timeframe}): ( ${var.diskspace_time_aggregator}(${var.diskspace_timeframe}): (
avg:aws.rds.free_storage_space${module.filter-tags.query_alert} by {region,name} / avg:aws.rds.free_storage_space${module.filter-tags.query_alert} by {region,name} /
avg:aws.rds.total_storage_space${module.filter-tags.query_alert} by {region,name} * 100 avg:aws.rds.total_storage_space${module.filter-tags.query_alert} by {region,name} * 100
) < ${var.diskspace_threshold_critical} ) < ${var.diskspace_threshold_critical}
EOQ EOQ
thresholds { thresholds = {
warning = "${var.diskspace_threshold_warning}" warning = var.diskspace_threshold_warning
critical = "${var.diskspace_threshold_critical}" critical = var.diskspace_threshold_critical
} }
evaluation_delay = var.evaluation_delay
new_host_delay = var.new_host_delay
notify_no_data = true notify_no_data = true
evaluation_delay = "${var.evaluation_delay}"
notify_audit = false notify_audit = false
timeout_h = 0 timeout_h = 0
include_tags = true include_tags = true
locked = false locked = false
require_full_window = false require_full_window = false
new_host_delay = "${var.new_host_delay}"
tags = ["env:${var.environment}", "type:cloud", "provider:aws", "resource:rds", "team:claranet", "created-by:terraform", "${var.diskspace_extra_tags}"] tags = concat(["env:${var.environment}", "type:cloud", "provider:aws", "resource:rds", "team:claranet", "created-by:terraform"], var.diskspace_extra_tags)
} }
### RDS Replica Lag monitor ### ### RDS Replica Lag monitor ###
resource "datadog_monitor" "rds_replica_lag" { resource "datadog_monitor" "rds_replica_lag" {
count = "${var.replicalag_enabled == "true" ? 1 : 0}" count = var.replicalag_enabled == "true" ? 1 : 0
name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] RDS replica lag {{#is_alert}}{{{comparator}}} {{threshold}} ms ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ms ({{value}}%){{/is_warning}}" name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] RDS replica lag {{#is_alert}}{{{comparator}}} {{threshold}} ms ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ms ({{value}}%){{/is_warning}}"
message = "${coalesce(var.replicalag_message, var.message)}" message = coalesce(var.replicalag_message, var.message)
type = "query alert"
type = "query alert" query = <<EOQ
query = <<EOQ
avg(${var.replicalag_timeframe}): ( avg(${var.replicalag_timeframe}): (
avg:aws.rds.replica_lag${module.filter-tags.query_alert} by {region,name} avg:aws.rds.replica_lag${module.filter-tags.query_alert} by {region,name}
) > ${var.replicalag_threshold_critical} ) > ${var.replicalag_threshold_critical}
EOQ EOQ
thresholds { thresholds = {
warning = "${var.replicalag_threshold_warning}" warning = var.replicalag_threshold_warning
critical = "${var.replicalag_threshold_critical}" critical = var.replicalag_threshold_critical
} }
evaluation_delay = var.evaluation_delay
new_host_delay = var.new_host_delay
notify_no_data = false notify_no_data = false
evaluation_delay = "${var.evaluation_delay}"
notify_audit = false notify_audit = false
timeout_h = 0 timeout_h = 0
include_tags = true include_tags = true
locked = false locked = false
require_full_window = false require_full_window = false
new_host_delay = "${var.new_host_delay}"
tags = ["env:${var.environment}", "type:cloud", "provider:aws", "resource:rds", "team:claranet", "created-by:terraform", "${var.replicalag_extra_tags}"] tags = concat(["env:${var.environment}", "type:cloud", "provider:aws", "resource:rds", "team:claranet", "created-by:terraform"], var.replicalag_extra_tags)
} }

View File

@ -1,14 +1,15 @@
output "rds_cpu_90_15min_id" { output "rds_cpu_90_15min_id" {
description = "id for monitor rds_cpu_90_15min" description = "id for monitor rds_cpu_90_15min"
value = "${datadog_monitor.rds_cpu_90_15min.*.id}" value = datadog_monitor.rds_cpu_90_15min.*.id
} }
output "rds_free_space_low_id" { output "rds_free_space_low_id" {
description = "id for monitor rds_free_space_low" description = "id for monitor rds_free_space_low"
value = "${datadog_monitor.rds_free_space_low.*.id}" value = datadog_monitor.rds_free_space_low.*.id
} }
output "rds_replica_lag_id" { output "rds_replica_lag_id" {
description = "id for monitor rds_replica_lag" description = "id for monitor rds_replica_lag"
value = "${datadog_monitor.rds_replica_lag.*.id}" value = datadog_monitor.rds_replica_lag.*.id
} }

View File

@ -6,8 +6,8 @@
module "datadog-monitors-cloud-aws-vpn" { module "datadog-monitors-cloud-aws-vpn" {
source = "git::ssh://git@git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors.git//cloud/aws/vpn?ref={revision}" source = "git::ssh://git@git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors.git//cloud/aws/vpn?ref={revision}"
environment = "${var.environment}" environment = var.environment
message = "${module.datadog-message-alerting.alerting-message}" message = module.datadog-message-alerting.alerting-message
} }
``` ```
@ -29,7 +29,7 @@ Creates DataDog monitors with the following checks:
| new\_host\_delay | Delay in seconds before monitor new resource | string | `"300"` | no | | new\_host\_delay | Delay in seconds before monitor new resource | string | `"300"` | no |
| prefix\_slug | Prefix string to prepend between brackets on every monitors names | string | `""` | no | | prefix\_slug | Prefix string to prepend between brackets on every monitors names | string | `""` | no |
| vpn\_status\_enabled | Flag to enable VPN status monitor | string | `"true"` | no | | vpn\_status\_enabled | Flag to enable VPN status monitor | string | `"true"` | no |
| vpn\_status\_extra\_tags | Extra tags for VPN status monitor | list | `[]` | no | | vpn\_status\_extra\_tags | Extra tags for VPN status monitor | list(string) | `[]` | no |
| vpn\_status\_message | Custom message for VPN status monitor | string | `""` | no | | vpn\_status\_message | Custom message for VPN status monitor | string | `""` | no |
| vpn\_status\_time\_aggregator | Monitor aggregator for VPN status [available values: min, max or avg] | string | `"max"` | no | | vpn\_status\_time\_aggregator | Monitor aggregator for VPN status [available values: min, max or avg] | string | `"max"` | no |
| vpn\_status\_timeframe | Monitor timeframe for VPN status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no | | vpn\_status\_timeframe | Monitor timeframe for VPN status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no |

View File

@ -1,7 +1,7 @@
# Global Terraform # Global Terraform
variable "environment" { variable "environment" {
description = "Architecture Environment" description = "Architecture Environment"
type = "string" type = string
} }
# Global DataDog # Global DataDog
@ -31,30 +31,31 @@ variable "filter_tags" {
variable "vpn_status_enabled" { variable "vpn_status_enabled" {
description = "Flag to enable VPN status monitor" description = "Flag to enable VPN status monitor"
type = "string" type = string
default = "true" default = "true"
} }
variable "vpn_status_extra_tags" { variable "vpn_status_extra_tags" {
description = "Extra tags for VPN status monitor" description = "Extra tags for VPN status monitor"
type = "list" type = list(string)
default = [] default = []
} }
variable "vpn_status_message" { variable "vpn_status_message" {
description = "Custom message for VPN status monitor" description = "Custom message for VPN status monitor"
type = "string" type = string
default = "" default = ""
} }
variable "vpn_status_time_aggregator" { variable "vpn_status_time_aggregator" {
description = "Monitor aggregator for VPN status [available values: min, max or avg]" description = "Monitor aggregator for VPN status [available values: min, max or avg]"
type = "string" type = string
default = "max" default = "max"
} }
variable "vpn_status_timeframe" { variable "vpn_status_timeframe" {
description = "Monitor timeframe for VPN status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" description = "Monitor timeframe for VPN status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string" type = string
default = "last_5m" default = "last_5m"
} }

View File

@ -1,24 +1,24 @@
resource "datadog_monitor" "VPN_status" { resource "datadog_monitor" "VPN_status" {
count = "${var.vpn_status_enabled == "true" ? 1 : 0}" count = var.vpn_status_enabled == "true" ? 1 : 0
name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] VPN tunnel down" name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] VPN tunnel down"
message = "${coalesce(var.vpn_status_message, var.message)}" message = coalesce(var.vpn_status_message, var.message)
type = "query alert"
query = <<EOQ query = <<EOQ
${var.vpn_status_time_aggregator}(${var.vpn_status_timeframe}): ( ${var.vpn_status_time_aggregator}(${var.vpn_status_timeframe}): (
min:aws.vpn.tunnel_state{${var.filter_tags}} by {region,tunnelipaddress} min:aws.vpn.tunnel_state{${var.filter_tags}} by {region,tunnelipaddress}
) < 1 ) < 1
EOQ EOQ
type = "query alert"
evaluation_delay = var.evaluation_delay
new_host_delay = var.new_host_delay
notify_no_data = true notify_no_data = true
renotify_interval = 0 renotify_interval = 0
evaluation_delay = "${var.evaluation_delay}"
new_host_delay = "${var.new_host_delay}"
notify_audit = false notify_audit = false
timeout_h = 0 timeout_h = 0
include_tags = true include_tags = true
require_full_window = false require_full_window = false
tags = ["env:${var.environment}", "type:cloud", "provider:aws", "resource:vpn", "team:claranet", "created-by:terraform", "${var.vpn_status_extra_tags}"] tags = concat(["env:${var.environment}", "type:cloud", "provider:aws", "resource:vpn", "team:claranet", "created-by:terraform"], var.vpn_status_extra_tags)
} }

View File

@ -1,4 +1,5 @@
output "VPN_status_id" { output "VPN_status_id" {
description = "id for monitor VPN_status" description = "id for monitor VPN_status"
value = "${datadog_monitor.VPN_status.*.id}" value = datadog_monitor.VPN_status.*.id
} }

View File

@ -6,8 +6,8 @@
module "datadog-monitors-cloud-azure-apimanagement" { module "datadog-monitors-cloud-azure-apimanagement" {
source = "git::ssh://git@git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors.git//cloud/azure/apimanagement?ref={revision}" source = "git::ssh://git@git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors.git//cloud/azure/apimanagement?ref={revision}"
environment = "${var.environment}" environment = var.environment
message = "${module.datadog-message-alerting.alerting-message}" message = module.datadog-message-alerting.alerting-message
} }
``` ```
@ -29,7 +29,7 @@ Creates DataDog monitors with the following checks:
| environment | Architecture environment | string | n/a | yes | | environment | Architecture environment | string | n/a | yes |
| evaluation\_delay | Delay in seconds for the metric evaluation | string | `"900"` | no | | evaluation\_delay | Delay in seconds for the metric evaluation | string | `"900"` | no |
| failed\_requests\_enabled | Flag to enable API Management failed requests monitor | string | `"true"` | no | | failed\_requests\_enabled | Flag to enable API Management failed requests monitor | string | `"true"` | no |
| failed\_requests\_extra\_tags | Extra tags for API Management failed requests monitor | list | `[]` | no | | failed\_requests\_extra\_tags | Extra tags for API Management failed requests monitor | list(string) | `[]` | no |
| failed\_requests\_message | Custom message for API Management failed requests monitor | string | `""` | no | | failed\_requests\_message | Custom message for API Management failed requests monitor | string | `""` | no |
| failed\_requests\_threshold\_critical | Maximum acceptable percent of failed requests | string | `"90"` | no | | failed\_requests\_threshold\_critical | Maximum acceptable percent of failed requests | string | `"90"` | no |
| failed\_requests\_threshold\_warning | Warning regarding acceptable percent of failed requests | string | `"50"` | no | | failed\_requests\_threshold\_warning | Warning regarding acceptable percent of failed requests | string | `"50"` | no |
@ -41,7 +41,7 @@ Creates DataDog monitors with the following checks:
| message | Message sent when a Redis monitor is triggered | string | n/a | yes | | message | Message sent when a Redis monitor is triggered | string | n/a | yes |
| new\_host\_delay | Delay in seconds before monitor new resource | string | `"300"` | no | | new\_host\_delay | Delay in seconds before monitor new resource | string | `"300"` | no |
| other\_requests\_enabled | Flag to enable API Management other requests monitor | string | `"true"` | no | | other\_requests\_enabled | Flag to enable API Management other requests monitor | string | `"true"` | no |
| other\_requests\_extra\_tags | Extra tags for API Management other requests monitor | list | `[]` | no | | other\_requests\_extra\_tags | Extra tags for API Management other requests monitor | list(string) | `[]` | no |
| other\_requests\_message | Custom message for API Management other requests monitor | string | `""` | no | | other\_requests\_message | Custom message for API Management other requests monitor | string | `""` | no |
| other\_requests\_threshold\_critical | Maximum acceptable percent of other requests | string | `"90"` | no | | other\_requests\_threshold\_critical | Maximum acceptable percent of other requests | string | `"90"` | no |
| other\_requests\_threshold\_warning | Warning regarding acceptable percent of other requests | string | `"50"` | no | | other\_requests\_threshold\_warning | Warning regarding acceptable percent of other requests | string | `"50"` | no |
@ -49,19 +49,19 @@ Creates DataDog monitors with the following checks:
| other\_requests\_timeframe | Monitor timeframe for API Management other requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no | | other\_requests\_timeframe | Monitor timeframe for API Management other requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no |
| prefix\_slug | Prefix string to prepend between brackets on every monitors names | string | `""` | no | | prefix\_slug | Prefix string to prepend between brackets on every monitors names | string | `""` | no |
| status\_enabled | Flag to enable API Management status monitor | string | `"true"` | no | | status\_enabled | Flag to enable API Management status monitor | string | `"true"` | no |
| status\_extra\_tags | Extra tags for API Management status monitor | list | `[]` | no | | status\_extra\_tags | Extra tags for API Management status monitor | list(string) | `[]` | no |
| status\_message | Custom message for API Management status monitor | string | `""` | no | | status\_message | Custom message for API Management status monitor | string | `""` | no |
| status\_time\_aggregator | Monitor aggregator for API Management status [available values: min, max or avg] | string | `"max"` | no | | status\_time\_aggregator | Monitor aggregator for API Management status [available values: min, max or avg] | string | `"max"` | no |
| status\_timeframe | Monitor timeframe for API Management status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no | | status\_timeframe | Monitor timeframe for API Management status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no |
| successful\_requests\_enabled | Flag to enable API Management successful requests monitor | string | `"true"` | no | | successful\_requests\_enabled | Flag to enable API Management successful requests monitor | string | `"true"` | no |
| successful\_requests\_extra\_tags | Extra tags for API Management successful requests monitor | list | `[]` | no | | successful\_requests\_extra\_tags | Extra tags for API Management successful requests monitor | list(string) | `[]` | no |
| successful\_requests\_message | Custom message for API Management successful requests monitor | string | `""` | no | | successful\_requests\_message | Custom message for API Management successful requests monitor | string | `""` | no |
| successful\_requests\_threshold\_critical | Minimum acceptable percent of successful requests | string | `"10"` | no | | successful\_requests\_threshold\_critical | Minimum acceptable percent of successful requests | string | `"10"` | no |
| successful\_requests\_threshold\_warning | Warning regarding acceptable percent of successful requests | string | `"30"` | no | | successful\_requests\_threshold\_warning | Warning regarding acceptable percent of successful requests | string | `"30"` | no |
| successful\_requests\_time\_aggregator | Monitor aggregator for API Management successful requests [available values: min, max or avg] | string | `"max"` | no | | successful\_requests\_time\_aggregator | Monitor aggregator for API Management successful requests [available values: min, max or avg] | string | `"max"` | no |
| successful\_requests\_timeframe | Monitor timeframe for API Management successful requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no | | successful\_requests\_timeframe | Monitor timeframe for API Management successful requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no |
| unauthorized\_requests\_enabled | Flag to enable API Management unauthorized requests monitor | string | `"true"` | no | | unauthorized\_requests\_enabled | Flag to enable API Management unauthorized requests monitor | string | `"true"` | no |
| unauthorized\_requests\_extra\_tags | Extra tags for API Management unauthorized requests monitor | list | `[]` | no | | unauthorized\_requests\_extra\_tags | Extra tags for API Management unauthorized requests monitor | list(string) | `[]` | no |
| unauthorized\_requests\_message | Custom message for API Management unauthorized requests monitor | string | `""` | no | | unauthorized\_requests\_message | Custom message for API Management unauthorized requests monitor | string | `""` | no |
| unauthorized\_requests\_threshold\_critical | Maximum acceptable percent of unauthorized requests | string | `"90"` | no | | unauthorized\_requests\_threshold\_critical | Maximum acceptable percent of unauthorized requests | string | `"90"` | no |
| unauthorized\_requests\_threshold\_warning | Warning regarding acceptable percent of unauthorized requests | string | `"50"` | no | | unauthorized\_requests\_threshold\_warning | Warning regarding acceptable percent of unauthorized requests | string | `"50"` | no |

View File

@ -1,7 +1,7 @@
# Global Terraform # Global Terraform
variable "environment" { variable "environment" {
description = "Architecture environment" description = "Architecture environment"
type = "string" type = string
} }
# Global DataDog # Global DataDog
@ -43,61 +43,61 @@ variable "filter_tags_custom_excluded" {
variable "status_enabled" { variable "status_enabled" {
description = "Flag to enable API Management status monitor" description = "Flag to enable API Management status monitor"
type = "string" type = string
default = "true" default = "true"
} }
variable "status_extra_tags" { variable "status_extra_tags" {
description = "Extra tags for API Management status monitor" description = "Extra tags for API Management status monitor"
type = "list" type = list(string)
default = [] default = []
} }
variable "status_message" { variable "status_message" {
description = "Custom message for API Management status monitor" description = "Custom message for API Management status monitor"
type = "string" type = string
default = "" default = ""
} }
variable "status_time_aggregator" { variable "status_time_aggregator" {
description = "Monitor aggregator for API Management status [available values: min, max or avg]" description = "Monitor aggregator for API Management status [available values: min, max or avg]"
type = "string" type = string
default = "max" default = "max"
} }
variable "status_timeframe" { variable "status_timeframe" {
description = "Monitor timeframe for API Management status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" description = "Monitor timeframe for API Management status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string" type = string
default = "last_5m" default = "last_5m"
} }
variable "failed_requests_enabled" { variable "failed_requests_enabled" {
description = "Flag to enable API Management failed requests monitor" description = "Flag to enable API Management failed requests monitor"
type = "string" type = string
default = "true" default = "true"
} }
variable "failed_requests_extra_tags" { variable "failed_requests_extra_tags" {
description = "Extra tags for API Management failed requests monitor" description = "Extra tags for API Management failed requests monitor"
type = "list" type = list(string)
default = [] default = []
} }
variable "failed_requests_message" { variable "failed_requests_message" {
description = "Custom message for API Management failed requests monitor" description = "Custom message for API Management failed requests monitor"
type = "string" type = string
default = "" default = ""
} }
variable "failed_requests_time_aggregator" { variable "failed_requests_time_aggregator" {
description = "Monitor aggregator for API Management failed requests [available values: min, max or avg]" description = "Monitor aggregator for API Management failed requests [available values: min, max or avg]"
type = "string" type = string
default = "min" default = "min"
} }
variable "failed_requests_timeframe" { variable "failed_requests_timeframe" {
description = "Monitor timeframe for API Management failed requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" description = "Monitor timeframe for API Management failed requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string" type = string
default = "last_5m" default = "last_5m"
} }
@ -113,31 +113,31 @@ variable "failed_requests_threshold_warning" {
variable "other_requests_enabled" { variable "other_requests_enabled" {
description = "Flag to enable API Management other requests monitor" description = "Flag to enable API Management other requests monitor"
type = "string" type = string
default = "true" default = "true"
} }
variable "other_requests_extra_tags" { variable "other_requests_extra_tags" {
description = "Extra tags for API Management other requests monitor" description = "Extra tags for API Management other requests monitor"
type = "list" type = list(string)
default = [] default = []
} }
variable "other_requests_message" { variable "other_requests_message" {
description = "Custom message for API Management other requests monitor" description = "Custom message for API Management other requests monitor"
type = "string" type = string
default = "" default = ""
} }
variable "other_requests_time_aggregator" { variable "other_requests_time_aggregator" {
description = "Monitor aggregator for API Management other requests [available values: min, max or avg]" description = "Monitor aggregator for API Management other requests [available values: min, max or avg]"
type = "string" type = string
default = "min" default = "min"
} }
variable "other_requests_timeframe" { variable "other_requests_timeframe" {
description = "Monitor timeframe for API Management other requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" description = "Monitor timeframe for API Management other requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string" type = string
default = "last_5m" default = "last_5m"
} }
@ -153,31 +153,31 @@ variable "other_requests_threshold_warning" {
variable "unauthorized_requests_enabled" { variable "unauthorized_requests_enabled" {
description = "Flag to enable API Management unauthorized requests monitor" description = "Flag to enable API Management unauthorized requests monitor"
type = "string" type = string
default = "true" default = "true"
} }
variable "unauthorized_requests_extra_tags" { variable "unauthorized_requests_extra_tags" {
description = "Extra tags for API Management unauthorized requests monitor" description = "Extra tags for API Management unauthorized requests monitor"
type = "list" type = list(string)
default = [] default = []
} }
variable "unauthorized_requests_message" { variable "unauthorized_requests_message" {
description = "Custom message for API Management unauthorized requests monitor" description = "Custom message for API Management unauthorized requests monitor"
type = "string" type = string
default = "" default = ""
} }
variable "unauthorized_requests_time_aggregator" { variable "unauthorized_requests_time_aggregator" {
description = "Monitor aggregator for API Management unauthorized requests [available values: min, max or avg]" description = "Monitor aggregator for API Management unauthorized requests [available values: min, max or avg]"
type = "string" type = string
default = "min" default = "min"
} }
variable "unauthorized_requests_timeframe" { variable "unauthorized_requests_timeframe" {
description = "Monitor timeframe for API Management unauthorized requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" description = "Monitor timeframe for API Management unauthorized requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string" type = string
default = "last_5m" default = "last_5m"
} }
@ -193,31 +193,31 @@ variable "unauthorized_requests_threshold_warning" {
variable "successful_requests_enabled" { variable "successful_requests_enabled" {
description = "Flag to enable API Management successful requests monitor" description = "Flag to enable API Management successful requests monitor"
type = "string" type = string
default = "true" default = "true"
} }
variable "successful_requests_extra_tags" { variable "successful_requests_extra_tags" {
description = "Extra tags for API Management successful requests monitor" description = "Extra tags for API Management successful requests monitor"
type = "list" type = list(string)
default = [] default = []
} }
variable "successful_requests_message" { variable "successful_requests_message" {
description = "Custom message for API Management successful requests monitor" description = "Custom message for API Management successful requests monitor"
type = "string" type = string
default = "" default = ""
} }
variable "successful_requests_time_aggregator" { variable "successful_requests_time_aggregator" {
description = "Monitor aggregator for API Management successful requests [available values: min, max or avg]" description = "Monitor aggregator for API Management successful requests [available values: min, max or avg]"
type = "string" type = string
default = "max" default = "max"
} }
variable "successful_requests_timeframe" { variable "successful_requests_timeframe" {
description = "Monitor timeframe for API Management successful requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" description = "Monitor timeframe for API Management successful requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string" type = string
default = "last_5m" default = "last_5m"
} }
@ -230,3 +230,4 @@ variable "successful_requests_threshold_warning" {
description = "Warning regarding acceptable percent of successful requests" description = "Warning regarding acceptable percent of successful requests"
default = 30 default = 30
} }

View File

@ -1,9 +1,10 @@
module "filter-tags" { module "filter-tags" {
source = "../../../common/filter-tags" source = "../../../common/filter-tags"
environment = "${var.environment}" environment = var.environment
resource = "azure_apimanagement" resource = "azure_apimanagement"
filter_tags_use_defaults = "${var.filter_tags_use_defaults}" filter_tags_use_defaults = var.filter_tags_use_defaults
filter_tags_custom = "${var.filter_tags_custom}" filter_tags_custom = var.filter_tags_custom
filter_tags_custom_excluded = "${var.filter_tags_custom_excluded}" filter_tags_custom_excluded = var.filter_tags_custom_excluded
} }

View File

@ -1,128 +1,128 @@
resource "datadog_monitor" "apimgt_status" { resource "datadog_monitor" "apimgt_status" {
count = "${var.status_enabled == "true" ? 1 : 0}" count = var.status_enabled == "true" ? 1 : 0
name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] API Management is down" name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] API Management is down"
message = "${coalesce(var.status_message, var.message)}" message = coalesce(var.status_message, var.message)
type = "metric alert"
query = <<EOQ query = <<EOQ
${var.status_time_aggregator}(${var.status_timeframe}):avg:azure.apimanagement_service.status${module.filter-tags.query_alert} by {resource_group,region,name} < 1 ${var.status_time_aggregator}(${var.status_timeframe}):avg:azure.apimanagement_service.status${module.filter-tags.query_alert} by {resource_group,region,name} < 1
EOQ EOQ
type = "metric alert" thresholds = {
thresholds {
critical = 1 critical = 1
} }
evaluation_delay = var.evaluation_delay
new_host_delay = var.new_host_delay
notify_no_data = true notify_no_data = true
evaluation_delay = "${var.evaluation_delay}"
renotify_interval = 0 renotify_interval = 0
notify_audit = false notify_audit = false
timeout_h = 0 timeout_h = 0
include_tags = true include_tags = true
locked = false locked = false
require_full_window = false require_full_window = false
new_host_delay = "${var.new_host_delay}"
tags = ["env:${var.environment}", "type:cloud", "provider:azure", "resource:apimanagement", "team:claranet", "created-by:terraform", "${var.status_extra_tags}"] tags = concat(["env:${var.environment}", "type:cloud", "provider:azure", "resource:apimanagement", "team:claranet", "created-by:terraform"], var.status_extra_tags)
} }
resource "datadog_monitor" "apimgt_failed_requests" { resource "datadog_monitor" "apimgt_failed_requests" {
count = "${var.failed_requests_enabled == "true" ? 1 : 0}" count = var.failed_requests_enabled == "true" ? 1 : 0
name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] API Management too many failed requests {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] API Management too many failed requests {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
message = "${coalesce(var.failed_requests_message, var.message)}" message = coalesce(var.failed_requests_message, var.message)
type = "query alert"
query = <<EOQ query = <<EOQ
${var.failed_requests_time_aggregator}(${var.failed_requests_timeframe}): ( ${var.failed_requests_time_aggregator}(${var.failed_requests_timeframe}): (
default(avg:azure.apimanagement_service.failed_requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) / default(avg:azure.apimanagement_service.failed_requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) /
default(avg:azure.apimanagement_service.total_requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 1) default(avg:azure.apimanagement_service.total_requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 1)
) * 100 > ${var.failed_requests_threshold_critical} ) * 100 > ${var.failed_requests_threshold_critical}
EOQ EOQ
thresholds { thresholds = {
critical = "${var.failed_requests_threshold_critical}" critical = var.failed_requests_threshold_critical
warning = "${var.failed_requests_threshold_warning}" warning = var.failed_requests_threshold_warning
} }
type = "query alert" new_host_delay = var.new_host_delay
evaluation_delay = var.evaluation_delay
notify_no_data = false notify_no_data = false
notify_audit = false notify_audit = false
timeout_h = 1 timeout_h = 1
include_tags = true include_tags = true
locked = false locked = false
require_full_window = false require_full_window = false
new_host_delay = "${var.new_host_delay}"
evaluation_delay = "${var.evaluation_delay}"
renotify_interval = 0 renotify_interval = 0
tags = ["env:${var.environment}", "type:cloud", "provider:azure", "resource:apimanagement", "team:claranet", "created-by:terraform", "${var.failed_requests_extra_tags}"] tags = concat(["env:${var.environment}", "type:cloud", "provider:azure", "resource:apimanagement", "team:claranet", "created-by:terraform"], var.failed_requests_extra_tags)
} }
resource "datadog_monitor" "apimgt_other_requests" { resource "datadog_monitor" "apimgt_other_requests" {
count = "${var.other_requests_enabled == "true" ? 1 : 0}" count = var.other_requests_enabled == "true" ? 1 : 0
name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] API Management too many other requests {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] API Management too many other requests {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
message = "${coalesce(var.other_requests_message, var.message)}" message = coalesce(var.other_requests_message, var.message)
type = "query alert"
query = <<EOQ query = <<EOQ
${var.other_requests_time_aggregator}(${var.other_requests_timeframe}): ( ${var.other_requests_time_aggregator}(${var.other_requests_timeframe}): (
default(avg:azure.apimanagement_service.other_requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) / default(avg:azure.apimanagement_service.other_requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) /
default(avg:azure.apimanagement_service.total_requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 1) default(avg:azure.apimanagement_service.total_requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 1)
) * 100 > ${var.other_requests_threshold_critical} ) * 100 > ${var.other_requests_threshold_critical}
EOQ EOQ
thresholds { thresholds = {
critical = "${var.other_requests_threshold_critical}" critical = var.other_requests_threshold_critical
warning = "${var.other_requests_threshold_warning}" warning = var.other_requests_threshold_warning
} }
type = "query alert" new_host_delay = var.new_host_delay
evaluation_delay = var.evaluation_delay
notify_no_data = false notify_no_data = false
notify_audit = false notify_audit = false
timeout_h = 1 timeout_h = 1
include_tags = true include_tags = true
locked = false locked = false
require_full_window = false require_full_window = false
new_host_delay = "${var.new_host_delay}"
evaluation_delay = "${var.evaluation_delay}"
renotify_interval = 0 renotify_interval = 0
tags = ["env:${var.environment}", "type:cloud", "provider:azure", "resource:apimanagement", "team:claranet", "created-by:terraform", "${var.other_requests_extra_tags}"] tags = concat(["env:${var.environment}", "type:cloud", "provider:azure", "resource:apimanagement", "team:claranet", "created-by:terraform"], var.other_requests_extra_tags)
} }
resource "datadog_monitor" "apimgt_unauthorized_requests" { resource "datadog_monitor" "apimgt_unauthorized_requests" {
count = "${var.unauthorized_requests_enabled == "true" ? 1 : 0}" count = var.unauthorized_requests_enabled == "true" ? 1 : 0
name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] API Management too many unauthorized requests {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] API Management too many unauthorized requests {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
message = "${coalesce(var.unauthorized_requests_message, var.message)}" message = coalesce(var.unauthorized_requests_message, var.message)
type = "query alert"
query = <<EOQ query = <<EOQ
${var.unauthorized_requests_time_aggregator}(${var.unauthorized_requests_timeframe}): ( ${var.unauthorized_requests_time_aggregator}(${var.unauthorized_requests_timeframe}): (
default(avg:azure.apimanagement_service.unauthorized_requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) / default(avg:azure.apimanagement_service.unauthorized_requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) /
default(avg:azure.apimanagement_service.total_requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 1) default(avg:azure.apimanagement_service.total_requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 1)
) * 100 > ${var.unauthorized_requests_threshold_critical} ) * 100 > ${var.unauthorized_requests_threshold_critical}
EOQ EOQ
thresholds { thresholds = {
critical = "${var.unauthorized_requests_threshold_critical}" critical = var.unauthorized_requests_threshold_critical
warning = "${var.unauthorized_requests_threshold_warning}" warning = var.unauthorized_requests_threshold_warning
} }
type = "query alert" new_host_delay = var.new_host_delay
evaluation_delay = var.evaluation_delay
notify_no_data = false notify_no_data = false
notify_audit = false notify_audit = false
timeout_h = 1 timeout_h = 1
include_tags = true include_tags = true
locked = false locked = false
require_full_window = false require_full_window = false
new_host_delay = "${var.new_host_delay}"
evaluation_delay = "${var.evaluation_delay}"
renotify_interval = 0 renotify_interval = 0
tags = ["env:${var.environment}", "type:cloud", "provider:azure", "resource:apimanagement", "team:claranet", "created-by:terraform", "${var.unauthorized_requests_extra_tags}"] tags = concat(["env:${var.environment}", "type:cloud", "provider:azure", "resource:apimanagement", "team:claranet", "created-by:terraform"], var.unauthorized_requests_extra_tags)
} }
resource "datadog_monitor" "apimgt_successful_requests" { resource "datadog_monitor" "apimgt_successful_requests" {
count = "${var.successful_requests_enabled == "true" ? 1 : 0}" count = var.successful_requests_enabled == "true" ? 1 : 0
name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] API Management successful requests rate too low {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] API Management successful requests rate too low {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
message = "${coalesce(var.successful_requests_message, var.message)}" message = coalesce(var.successful_requests_message, var.message)
type = "query alert"
query = <<EOQ query = <<EOQ
${var.successful_requests_time_aggregator}(${var.successful_requests_timeframe}): ${var.successful_requests_time_aggregator}(${var.successful_requests_timeframe}):
@ -131,23 +131,23 @@ resource "datadog_monitor" "apimgt_successful_requests" {
avg:azure.apimanagement_service.total_requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate() avg:azure.apimanagement_service.total_requests${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate()
* 100 * 100
, 100) < ${var.successful_requests_threshold_critical} , 100) < ${var.successful_requests_threshold_critical}
EOQ EOQ
thresholds { thresholds = {
critical = "${var.successful_requests_threshold_critical}" critical = var.successful_requests_threshold_critical
warning = "${var.successful_requests_threshold_warning}" warning = var.successful_requests_threshold_warning
} }
type = "query alert" new_host_delay = var.new_host_delay
evaluation_delay = var.evaluation_delay
notify_no_data = false notify_no_data = false
notify_audit = false notify_audit = false
timeout_h = 1 timeout_h = 1
include_tags = true include_tags = true
locked = false locked = false
require_full_window = false require_full_window = false
new_host_delay = "${var.new_host_delay}"
evaluation_delay = "${var.evaluation_delay}"
renotify_interval = 0 renotify_interval = 0
tags = ["env:${var.environment}", "type:cloud", "provider:azure", "resource:apimanagement", "team:claranet", "created-by:terraform", "${var.successful_requests_extra_tags}"] tags = concat(["env:${var.environment}", "type:cloud", "provider:azure", "resource:apimanagement", "team:claranet", "created-by:terraform"], var.successful_requests_extra_tags)
} }

View File

@ -1,24 +1,25 @@
output "apimgt_status_id" { output "apimgt_status_id" {
description = "id for monitor apimgt_status" description = "id for monitor apimgt_status"
value = "${datadog_monitor.apimgt_status.*.id}" value = datadog_monitor.apimgt_status.*.id
} }
output "apimgt_failed_requests_id" { output "apimgt_failed_requests_id" {
description = "id for monitor apimgt_failed_requests" description = "id for monitor apimgt_failed_requests"
value = "${datadog_monitor.apimgt_failed_requests.*.id}" value = datadog_monitor.apimgt_failed_requests.*.id
} }
output "apimgt_other_requests_id" { output "apimgt_other_requests_id" {
description = "id for monitor apimgt_other_requests" description = "id for monitor apimgt_other_requests"
value = "${datadog_monitor.apimgt_other_requests.*.id}" value = datadog_monitor.apimgt_other_requests.*.id
} }
output "apimgt_unauthorized_requests_id" { output "apimgt_unauthorized_requests_id" {
description = "id for monitor apimgt_unauthorized_requests" description = "id for monitor apimgt_unauthorized_requests"
value = "${datadog_monitor.apimgt_unauthorized_requests.*.id}" value = datadog_monitor.apimgt_unauthorized_requests.*.id
} }
output "apimgt_successful_requests_id" { output "apimgt_successful_requests_id" {
description = "id for monitor apimgt_successful_requests" description = "id for monitor apimgt_successful_requests"
value = "${datadog_monitor.apimgt_successful_requests.*.id}" value = datadog_monitor.apimgt_successful_requests.*.id
} }

View File

@ -6,8 +6,8 @@
module "datadog-monitors-cloud-azure-app-services" { module "datadog-monitors-cloud-azure-app-services" {
source = "git::ssh://git@git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors.git//cloud/azure/app-services?ref={revision}" source = "git::ssh://git@git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors.git//cloud/azure/app-services?ref={revision}"
environment = "${var.environment}" environment = var.environment
message = "${module.datadog-message-alerting.alerting-message}" message = module.datadog-message-alerting.alerting-message
} }
``` ```
@ -33,28 +33,28 @@ Creates DataDog monitors with the following checks:
| filter\_tags\_custom\_excluded | Tags excluded for custom filtering when filter_tags_use_defaults is false | string | `""` | no | | filter\_tags\_custom\_excluded | Tags excluded for custom filtering when filter_tags_use_defaults is false | string | `""` | no |
| filter\_tags\_use\_defaults | Use default filter tags convention | string | `"true"` | no | | filter\_tags\_use\_defaults | Use default filter tags convention | string | `"true"` | no |
| http\_4xx\_requests\_enabled | Flag to enable App Services 4xx requests monitor | string | `"true"` | no | | http\_4xx\_requests\_enabled | Flag to enable App Services 4xx requests monitor | string | `"true"` | no |
| http\_4xx\_requests\_extra\_tags | Extra tags for App Services 4xx requests monitor | list | `[]` | no | | http\_4xx\_requests\_extra\_tags | Extra tags for App Services 4xx requests monitor | list(string) | `[]` | no |
| http\_4xx\_requests\_message | Custom message for App Services 4xx requests monitor | string | `""` | no | | http\_4xx\_requests\_message | Custom message for App Services 4xx requests monitor | string | `""` | no |
| http\_4xx\_requests\_threshold\_critical | Maximum critical acceptable percent of 4xx errors | string | `"90"` | no | | http\_4xx\_requests\_threshold\_critical | Maximum critical acceptable percent of 4xx errors | string | `"90"` | no |
| http\_4xx\_requests\_threshold\_warning | Warning regarding acceptable percent of 4xx errors | string | `"50"` | no | | http\_4xx\_requests\_threshold\_warning | Warning regarding acceptable percent of 4xx errors | string | `"50"` | no |
| http\_4xx\_requests\_time\_aggregator | Monitor aggregator for App Services 4xx requests [available values: min, max or avg] | string | `"min"` | no | | http\_4xx\_requests\_time\_aggregator | Monitor aggregator for App Services 4xx requests [available values: min, max or avg] | string | `"min"` | no |
| http\_4xx\_requests\_timeframe | Monitor timeframe for App Services 4xx requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no | | http\_4xx\_requests\_timeframe | Monitor timeframe for App Services 4xx requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no |
| http\_5xx\_requests\_enabled | Flag to enable App Services 5xx requests monitor | string | `"true"` | no | | http\_5xx\_requests\_enabled | Flag to enable App Services 5xx requests monitor | string | `"true"` | no |
| http\_5xx\_requests\_extra\_tags | Extra tags for App Services 5xx requests monitor | list | `[]` | no | | http\_5xx\_requests\_extra\_tags | Extra tags for App Services 5xx requests monitor | list(string) | `[]` | no |
| http\_5xx\_requests\_message | Custom message for App Services 5xx requests monitor | string | `""` | no | | http\_5xx\_requests\_message | Custom message for App Services 5xx requests monitor | string | `""` | no |
| http\_5xx\_requests\_threshold\_critical | Maximum critical acceptable percent of 5xx errors | string | `"90"` | no | | http\_5xx\_requests\_threshold\_critical | Maximum critical acceptable percent of 5xx errors | string | `"90"` | no |
| http\_5xx\_requests\_threshold\_warning | Warning regarding acceptable percent of 5xx errors | string | `"50"` | no | | http\_5xx\_requests\_threshold\_warning | Warning regarding acceptable percent of 5xx errors | string | `"50"` | no |
| http\_5xx\_requests\_time\_aggregator | Monitor aggregator for App Services 5xx requests [available values: min, max or avg] | string | `"min"` | no | | http\_5xx\_requests\_time\_aggregator | Monitor aggregator for App Services 5xx requests [available values: min, max or avg] | string | `"min"` | no |
| http\_5xx\_requests\_timeframe | Monitor timeframe for App Services 5xx requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no | | http\_5xx\_requests\_timeframe | Monitor timeframe for App Services 5xx requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no |
| http\_successful\_requests\_enabled | Flag to enable App Services successful requests monitor | string | `"true"` | no | | http\_successful\_requests\_enabled | Flag to enable App Services successful requests monitor | string | `"true"` | no |
| http\_successful\_requests\_extra\_tags | Extra tags for App Services successful requests monitor | list | `[]` | no | | http\_successful\_requests\_extra\_tags | Extra tags for App Services successful requests monitor | list(string) | `[]` | no |
| http\_successful\_requests\_message | Custom message for App Services successful requests monitor | string | `""` | no | | http\_successful\_requests\_message | Custom message for App Services successful requests monitor | string | `""` | no |
| http\_successful\_requests\_threshold\_critical | Minimum critical acceptable percent of 2xx & 3xx requests | string | `"10"` | no | | http\_successful\_requests\_threshold\_critical | Minimum critical acceptable percent of 2xx & 3xx requests | string | `"10"` | no |
| http\_successful\_requests\_threshold\_warning | Warning regarding acceptable percent of 2xx & 3xx requests | string | `"30"` | no | | http\_successful\_requests\_threshold\_warning | Warning regarding acceptable percent of 2xx & 3xx requests | string | `"30"` | no |
| http\_successful\_requests\_time\_aggregator | Monitor aggregator for App Services successful requests [available values: min, max or avg] | string | `"max"` | no | | http\_successful\_requests\_time\_aggregator | Monitor aggregator for App Services successful requests [available values: min, max or avg] | string | `"max"` | no |
| http\_successful\_requests\_timeframe | Monitor timeframe for App Services successful requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no | | http\_successful\_requests\_timeframe | Monitor timeframe for App Services successful requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no |
| memory\_usage\_enabled | Flag to enable App Services memory usage monitor | string | `"true"` | no | | memory\_usage\_enabled | Flag to enable App Services memory usage monitor | string | `"true"` | no |
| memory\_usage\_extra\_tags | Extra tags for App Services memory usage monitor | list | `[]` | no | | memory\_usage\_extra\_tags | Extra tags for App Services memory usage monitor | list(string) | `[]` | no |
| memory\_usage\_message | Custom message for App Services memory usage monitor | string | `""` | no | | memory\_usage\_message | Custom message for App Services memory usage monitor | string | `""` | no |
| memory\_usage\_threshold\_critical | Alerting threshold in MiB | string | `"1073741824"` | no | | memory\_usage\_threshold\_critical | Alerting threshold in MiB | string | `"1073741824"` | no |
| memory\_usage\_threshold\_warning | Warning threshold in MiB | string | `"536870912"` | no | | memory\_usage\_threshold\_warning | Warning threshold in MiB | string | `"536870912"` | no |
@ -64,14 +64,14 @@ Creates DataDog monitors with the following checks:
| new\_host\_delay | Delay in seconds before monitor new resource | string | `"300"` | no | | new\_host\_delay | Delay in seconds before monitor new resource | string | `"300"` | no |
| prefix\_slug | Prefix string to prepend between brackets on every monitors names | string | `""` | no | | prefix\_slug | Prefix string to prepend between brackets on every monitors names | string | `""` | no |
| response\_time\_enabled | Flag to enable App Services response time monitor | string | `"true"` | no | | response\_time\_enabled | Flag to enable App Services response time monitor | string | `"true"` | no |
| response\_time\_extra\_tags | Extra tags for App Services response time monitor | list | `[]` | no | | response\_time\_extra\_tags | Extra tags for App Services response time monitor | list(string) | `[]` | no |
| response\_time\_message | Custom message for App Services response time monitor | string | `""` | no | | response\_time\_message | Custom message for App Services response time monitor | string | `""` | no |
| response\_time\_threshold\_critical | Alerting threshold for response time in seconds | string | `"10"` | no | | response\_time\_threshold\_critical | Alerting threshold for response time in seconds | string | `"10"` | no |
| response\_time\_threshold\_warning | Warning threshold for response time in seconds | string | `"5"` | no | | response\_time\_threshold\_warning | Warning threshold for response time in seconds | string | `"5"` | no |
| response\_time\_time\_aggregator | Monitor aggregator for App Services response time [available values: min, max or avg] | string | `"min"` | no | | response\_time\_time\_aggregator | Monitor aggregator for App Services response time [available values: min, max or avg] | string | `"min"` | no |
| response\_time\_timeframe | Monitor timeframe for App Services response time [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no | | response\_time\_timeframe | Monitor timeframe for App Services response time [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no |
| status\_enabled | Flag to enable App Services status monitor | string | `"true"` | no | | status\_enabled | Flag to enable App Services status monitor | string | `"true"` | no |
| status\_extra\_tags | Extra tags for App Services status monitor | list | `[]` | no | | status\_extra\_tags | Extra tags for App Services status monitor | list(string) | `[]` | no |
| status\_message | Custom message for App Services status monitor | string | `""` | no | | status\_message | Custom message for App Services status monitor | string | `""` | no |
| status\_time\_aggregator | Monitor aggregator for App Services status [available values: min, max or avg] | string | `"max"` | no | | status\_time\_aggregator | Monitor aggregator for App Services status [available values: min, max or avg] | string | `"max"` | no |
| status\_timeframe | Monitor timeframe for App Services status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no | | status\_timeframe | Monitor timeframe for App Services status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no |

View File

@ -1,6 +1,6 @@
variable "environment" { variable "environment" {
description = "Architecture environment" description = "Architecture environment"
type = "string" type = string
} }
variable "filter_tags_use_defaults" { variable "filter_tags_use_defaults" {
@ -41,31 +41,31 @@ variable "prefix_slug" {
variable "response_time_enabled" { variable "response_time_enabled" {
description = "Flag to enable App Services response time monitor" description = "Flag to enable App Services response time monitor"
type = "string" type = string
default = "true" default = "true"
} }
variable "response_time_extra_tags" { variable "response_time_extra_tags" {
description = "Extra tags for App Services response time monitor" description = "Extra tags for App Services response time monitor"
type = "list" type = list(string)
default = [] default = []
} }
variable "response_time_message" { variable "response_time_message" {
description = "Custom message for App Services response time monitor" description = "Custom message for App Services response time monitor"
type = "string" type = string
default = "" default = ""
} }
variable "response_time_time_aggregator" { variable "response_time_time_aggregator" {
description = "Monitor aggregator for App Services response time [available values: min, max or avg]" description = "Monitor aggregator for App Services response time [available values: min, max or avg]"
type = "string" type = string
default = "min" default = "min"
} }
variable "response_time_timeframe" { variable "response_time_timeframe" {
description = "Monitor timeframe for App Services response time [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" description = "Monitor timeframe for App Services response time [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string" type = string
default = "last_5m" default = "last_5m"
} }
@ -81,71 +81,71 @@ variable "response_time_threshold_warning" {
variable "memory_usage_enabled" { variable "memory_usage_enabled" {
description = "Flag to enable App Services memory usage monitor" description = "Flag to enable App Services memory usage monitor"
type = "string" type = string
default = "true" default = "true"
} }
variable "memory_usage_extra_tags" { variable "memory_usage_extra_tags" {
description = "Extra tags for App Services memory usage monitor" description = "Extra tags for App Services memory usage monitor"
type = "list" type = list(string)
default = [] default = []
} }
variable "memory_usage_message" { variable "memory_usage_message" {
description = "Custom message for App Services memory usage monitor" description = "Custom message for App Services memory usage monitor"
type = "string" type = string
default = "" default = ""
} }
variable "memory_usage_time_aggregator" { variable "memory_usage_time_aggregator" {
description = "Monitor aggregator for App Services memory usage [available values: min, max or avg]" description = "Monitor aggregator for App Services memory usage [available values: min, max or avg]"
type = "string" type = string
default = "min" default = "min"
} }
variable "memory_usage_timeframe" { variable "memory_usage_timeframe" {
description = "Monitor timeframe for App Services memory usage [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" description = "Monitor timeframe for App Services memory usage [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string" type = string
default = "last_5m" default = "last_5m"
} }
variable "memory_usage_threshold_critical" { variable "memory_usage_threshold_critical" {
default = 1073741824 # 1Gb default = 1073741824 # 1Gb
description = "Alerting threshold in Mib" description = "Alerting threshold in Mib"
} }
variable "memory_usage_threshold_warning" { variable "memory_usage_threshold_warning" {
default = 536870912 # 512Mb default = 536870912 # 512Mb
description = "Warning threshold in MiB" description = "Warning threshold in MiB"
} }
variable "http_4xx_requests_enabled" { variable "http_4xx_requests_enabled" {
description = "Flag to enable App Services 4xx requests monitor" description = "Flag to enable App Services 4xx requests monitor"
type = "string" type = string
default = "true" default = "true"
} }
variable "http_4xx_requests_extra_tags" { variable "http_4xx_requests_extra_tags" {
description = "Extra tags for App Services 4xx requests monitor" description = "Extra tags for App Services 4xx requests monitor"
type = "list" type = list(string)
default = [] default = []
} }
variable "http_4xx_requests_message" { variable "http_4xx_requests_message" {
description = "Custom message for App Services 4xx requests monitor" description = "Custom message for App Services 4xx requests monitor"
type = "string" type = string
default = "" default = ""
} }
variable "http_4xx_requests_time_aggregator" { variable "http_4xx_requests_time_aggregator" {
description = "Monitor aggregator for App Services 4xx requests [available values: min, max or avg]" description = "Monitor aggregator for App Services 4xx requests [available values: min, max or avg]"
type = "string" type = string
default = "min" default = "min"
} }
variable "http_4xx_requests_timeframe" { variable "http_4xx_requests_timeframe" {
description = "Monitor timeframe for App Services 4xx requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" description = "Monitor timeframe for App Services 4xx requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string" type = string
default = "last_5m" default = "last_5m"
} }
@ -161,31 +161,31 @@ variable "http_4xx_requests_threshold_warning" {
variable "http_5xx_requests_enabled" { variable "http_5xx_requests_enabled" {
description = "Flag to enable App Services 5xx requests monitor" description = "Flag to enable App Services 5xx requests monitor"
type = "string" type = string
default = "true" default = "true"
} }
variable "http_5xx_requests_extra_tags" { variable "http_5xx_requests_extra_tags" {
description = "Extra tags for App Services 5xx requests monitor" description = "Extra tags for App Services 5xx requests monitor"
type = "list" type = list(string)
default = [] default = []
} }
variable "http_5xx_requests_message" { variable "http_5xx_requests_message" {
description = "Custom message for App Services 5xx requests monitor" description = "Custom message for App Services 5xx requests monitor"
type = "string" type = string
default = "" default = ""
} }
variable "http_5xx_requests_time_aggregator" { variable "http_5xx_requests_time_aggregator" {
description = "Monitor aggregator for App Services 5xx requests [available values: min, max or avg]" description = "Monitor aggregator for App Services 5xx requests [available values: min, max or avg]"
type = "string" type = string
default = "min" default = "min"
} }
variable "http_5xx_requests_timeframe" { variable "http_5xx_requests_timeframe" {
description = "Monitor timeframe for App Services 5xx requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" description = "Monitor timeframe for App Services 5xx requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string" type = string
default = "last_5m" default = "last_5m"
} }
@ -201,31 +201,31 @@ variable "http_5xx_requests_threshold_warning" {
variable "http_successful_requests_enabled" { variable "http_successful_requests_enabled" {
description = "Flag to enable App Services successful requests monitor" description = "Flag to enable App Services successful requests monitor"
type = "string" type = string
default = "true" default = "true"
} }
variable "http_successful_requests_extra_tags" { variable "http_successful_requests_extra_tags" {
description = "Extra tags for App Services successful requests monitor" description = "Extra tags for App Services successful requests monitor"
type = "list" type = list(string)
default = [] default = []
} }
variable "http_successful_requests_message" { variable "http_successful_requests_message" {
description = "Custom message for App Services successful requests monitor" description = "Custom message for App Services successful requests monitor"
type = "string" type = string
default = "" default = ""
} }
variable "http_successful_requests_time_aggregator" { variable "http_successful_requests_time_aggregator" {
description = "Monitor aggregator for App Services successful requests [available values: min, max or avg]" description = "Monitor aggregator for App Services successful requests [available values: min, max or avg]"
type = "string" type = string
default = "max" default = "max"
} }
variable "http_successful_requests_timeframe" { variable "http_successful_requests_timeframe" {
description = "Monitor timeframe for App Services successful requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" description = "Monitor timeframe for App Services successful requests [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string" type = string
default = "last_5m" default = "last_5m"
} }
@ -241,30 +241,31 @@ variable "http_successful_requests_threshold_warning" {
variable "status_enabled" { variable "status_enabled" {
description = "Flag to enable App Services status monitor" description = "Flag to enable App Services status monitor"
type = "string" type = string
default = "true" default = "true"
} }
variable "status_message" { variable "status_message" {
description = "Custom message for App Services status monitor" description = "Custom message for App Services status monitor"
type = "string" type = string
default = "" default = ""
} }
variable "status_extra_tags" { variable "status_extra_tags" {
description = "Extra tags for App Services status monitor" description = "Extra tags for App Services status monitor"
type = "list" type = list(string)
default = [] default = []
} }
variable "status_time_aggregator" { variable "status_time_aggregator" {
description = "Monitor aggregator for App Services status [available values: min, max or avg]" description = "Monitor aggregator for App Services status [available values: min, max or avg]"
type = "string" type = string
default = "max" default = "max"
} }
variable "status_timeframe" { variable "status_timeframe" {
description = "Monitor timeframe for App Services status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" description = "Monitor timeframe for App Services status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string" type = string
default = "last_5m" default = "last_5m"
} }

View File

@ -1,9 +1,10 @@
module "filter-tags" { module "filter-tags" {
source = "../../../common/filter-tags" source = "../../../common/filter-tags"
environment = "${var.environment}" environment = var.environment
resource = "azure_app-services" resource = "azure_app-services"
filter_tags_use_defaults = "${var.filter_tags_use_defaults}" filter_tags_use_defaults = var.filter_tags_use_defaults
filter_tags_custom = "${var.filter_tags_custom}" filter_tags_custom = var.filter_tags_custom
filter_tags_custom_excluded = "${var.filter_tags_custom_excluded}" filter_tags_custom_excluded = var.filter_tags_custom_excluded
} }

View File

@ -1,131 +1,127 @@
# Monitoring App Services response time # Monitoring App Services response time
resource "datadog_monitor" "appservices_response_time" { resource "datadog_monitor" "appservices_response_time" {
count = "${var.response_time_enabled == "true" ? 1 : 0}" count = var.response_time_enabled == "true" ? 1 : 0
name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] App Services response time too high {{#is_alert}}{{{comparator}}} {{threshold}}s ({{value}}s){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}s ({{value}}s){{/is_warning}}" name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] App Services response time too high {{#is_alert}}{{{comparator}}} {{threshold}}s ({{value}}s){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}s ({{value}}s){{/is_warning}}"
message = coalesce(var.response_time_message, var.message)
type = "query alert" type = "query alert"
message = "${coalesce(var.response_time_message, var.message)}"
query = <<EOQ query = <<EOQ
${var.response_time_time_aggregator}(${var.response_time_timeframe}): ( ${var.response_time_time_aggregator}(${var.response_time_timeframe}): (
default(avg:azure.app_services.average_response_time${module.filter-tags.query_alert} by {resource_group,region,name,instance}, 0) default(avg:azure.app_services.average_response_time${module.filter-tags.query_alert} by {resource_group,region,name,instance}, 0)
) > ${var.response_time_threshold_critical} ) > ${var.response_time_threshold_critical}
EOQ EOQ
evaluation_delay = "${var.evaluation_delay}" thresholds = {
new_host_delay = "${var.new_host_delay}" warning = var.response_time_threshold_warning
critical = var.response_time_threshold_critical
thresholds {
warning = "${var.response_time_threshold_warning}"
critical = "${var.response_time_threshold_critical}"
} }
notify_no_data = false # Will NOT notify when no data is received evaluation_delay = var.evaluation_delay
new_host_delay = var.new_host_delay
notify_no_data = false
renotify_interval = 0 renotify_interval = 0
require_full_window = false require_full_window = false
timeout_h = 0 timeout_h = 0
include_tags = true include_tags = true
tags = ["env:${var.environment}", "type:cloud", "provider:azure", "resource:app-services", "team:claranet", "created-by:terraform", "${var.response_time_extra_tags}"] tags = concat(["env:${var.environment}", "type:cloud", "provider:azure", "resource:app-services", "team:claranet", "created-by:terraform"], var.response_time_extra_tags)
} }
# Monitoring App Services memory usage # Monitoring App Services memory usage
resource "datadog_monitor" "appservices_memory_usage_count" { resource "datadog_monitor" "appservices_memory_usage_count" {
count = "${var.memory_usage_enabled == "true" ? 1 : 0}" count = var.memory_usage_enabled == "true" ? 1 : 0
name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] App Services memory usage {{#is_alert}}{{{comparator}}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ({{value}}){{/is_warning}}" name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] App Services memory usage {{#is_alert}}{{{comparator}}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ({{value}}){{/is_warning}}"
message = coalesce(var.memory_usage_message, var.message)
type = "query alert" type = "query alert"
message = "${coalesce(var.memory_usage_message, var.message)}"
query = <<EOQ query = <<EOQ
${var.memory_usage_time_aggregator}(${var.memory_usage_timeframe}): ( ${var.memory_usage_time_aggregator}(${var.memory_usage_timeframe}): (
avg:azure.app_services.memory_working_set${module.filter-tags.query_alert} by {resource_group,region,name,instance} avg:azure.app_services.memory_working_set${module.filter-tags.query_alert} by {resource_group,region,name,instance}
) > ${var.memory_usage_threshold_critical} ) > ${var.memory_usage_threshold_critical}
EOQ EOQ
evaluation_delay = "${var.evaluation_delay}" thresholds = {
new_host_delay = "${var.new_host_delay}" warning = var.memory_usage_threshold_warning
critical = var.memory_usage_threshold_critical
thresholds {
warning = "${var.memory_usage_threshold_warning}"
critical = "${var.memory_usage_threshold_critical}"
} }
notify_no_data = false # Will NOT notify when no data is received evaluation_delay = var.evaluation_delay
new_host_delay = var.new_host_delay
notify_no_data = false
renotify_interval = 0 renotify_interval = 0
require_full_window = false require_full_window = false
timeout_h = 0 timeout_h = 0
include_tags = true include_tags = true
tags = ["env:${var.environment}", "type:cloud", "provider:azure", "resource:app-services", "team:claranet", "created-by:terraform", "${var.memory_usage_extra_tags}"] tags = concat(["env:${var.environment}", "type:cloud", "provider:azure", "resource:app-services", "team:claranet", "created-by:terraform"], var.memory_usage_extra_tags)
} }
# Monitoring App Services 5xx errors percent # Monitoring App Services 5xx errors percent
resource "datadog_monitor" "appservices_http_5xx_errors_count" { resource "datadog_monitor" "appservices_http_5xx_errors_count" {
count = "${var.http_5xx_requests_enabled == "true" ? 1 : 0}" count = var.http_5xx_requests_enabled == "true" ? 1 : 0
name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] App Services HTTP 5xx errors too high {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] App Services HTTP 5xx errors too high {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
message = coalesce(var.http_5xx_requests_message, var.message)
type = "query alert" type = "query alert"
message = "${coalesce(var.http_5xx_requests_message, var.message)}"
query = <<EOQ query = <<EOQ
${var.http_5xx_requests_time_aggregator}(${var.http_5xx_requests_timeframe}): ( ${var.http_5xx_requests_time_aggregator}(${var.http_5xx_requests_timeframe}): (
default(avg:azure.app_services.http5xx${module.filter-tags.query_alert} by {resource_group,region,name,instance}.as_rate(), 0) / default(avg:azure.app_services.http5xx${module.filter-tags.query_alert} by {resource_group,region,name,instance}.as_rate(), 0) /
default(avg:azure.app_services.requests${module.filter-tags.query_alert} by {resource_group,region,name,instance}.as_rate(), 1) default(avg:azure.app_services.requests${module.filter-tags.query_alert} by {resource_group,region,name,instance}.as_rate(), 1)
) * 100 > ${var.http_5xx_requests_threshold_critical} ) * 100 > ${var.http_5xx_requests_threshold_critical}
EOQ EOQ
evaluation_delay = "${var.evaluation_delay}" thresholds = {
new_host_delay = "${var.new_host_delay}" warning = var.http_5xx_requests_threshold_warning
critical = var.http_5xx_requests_threshold_critical
thresholds {
warning = "${var.http_5xx_requests_threshold_warning}"
critical = "${var.http_5xx_requests_threshold_critical}"
} }
notify_no_data = false # Will NOT notify when no data is received evaluation_delay = var.evaluation_delay
new_host_delay = var.new_host_delay
notify_no_data = false
renotify_interval = 0 renotify_interval = 0
require_full_window = false require_full_window = false
timeout_h = 1 timeout_h = 1
include_tags = true include_tags = true
tags = ["env:${var.environment}", "type:cloud", "provider:azure", "resource:app-services", "team:claranet", "created-by:terraform", "${var.http_5xx_requests_extra_tags}"] tags = concat(["env:${var.environment}", "type:cloud", "provider:azure", "resource:app-services", "team:claranet", "created-by:terraform"], var.http_5xx_requests_extra_tags)
} }
# Monitoring App Services 4xx errors percent # Monitoring App Services 4xx errors percent
resource "datadog_monitor" "appservices_http_4xx_errors_count" { resource "datadog_monitor" "appservices_http_4xx_errors_count" {
count = "${var.http_4xx_requests_enabled == "true" ? 1 : 0}" count = var.http_4xx_requests_enabled == "true" ? 1 : 0
name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] App Services HTTP 4xx errors too high {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] App Services HTTP 4xx errors too high {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
message = coalesce(var.http_4xx_requests_message, var.message)
type = "query alert" type = "query alert"
message = "${coalesce(var.http_4xx_requests_message, var.message)}"
query = <<EOQ query = <<EOQ
${var.http_4xx_requests_time_aggregator}(${var.http_4xx_requests_timeframe}): ( ${var.http_4xx_requests_time_aggregator}(${var.http_4xx_requests_timeframe}): (
default(avg:azure.app_services.http4xx${module.filter-tags.query_alert} by {resource_group,region,name,instance}.as_rate(), 0) / default(avg:azure.app_services.http4xx${module.filter-tags.query_alert} by {resource_group,region,name,instance}.as_rate(), 0) /
default(avg:azure.app_services.requests${module.filter-tags.query_alert} by {resource_group,region,name,instance}.as_rate(), 1) default(avg:azure.app_services.requests${module.filter-tags.query_alert} by {resource_group,region,name,instance}.as_rate(), 1)
) * 100 > ${var.http_4xx_requests_threshold_critical} ) * 100 > ${var.http_4xx_requests_threshold_critical}
EOQ EOQ
evaluation_delay = "${var.evaluation_delay}" thresholds = {
new_host_delay = "${var.new_host_delay}" warning = var.http_4xx_requests_threshold_warning
critical = var.http_4xx_requests_threshold_critical
thresholds {
warning = "${var.http_4xx_requests_threshold_warning}"
critical = "${var.http_4xx_requests_threshold_critical}"
} }
evaluation_delay = var.evaluation_delay
new_host_delay = var.new_host_delay
notify_no_data = false # Will NOT notify when no data is received notify_no_data = false # Will NOT notify when no data is received
renotify_interval = 0 renotify_interval = 0
require_full_window = false require_full_window = false
timeout_h = 1 timeout_h = 1
include_tags = true include_tags = true
tags = ["env:${var.environment}", "type:cloud", "provider:azure", "resource:app-services", "team:claranet", "created-by:terraform", "${var.http_4xx_requests_extra_tags}"] tags = concat(["env:${var.environment}", "type:cloud", "provider:azure", "resource:app-services", "team:claranet", "created-by:terraform"], var.http_4xx_requests_extra_tags)
} }
# Monitoring App Services HTTP 2xx & 3xx status pages percent # Monitoring App Services HTTP 2xx & 3xx status pages percent
resource "datadog_monitor" "appservices_http_success_status_rate" { resource "datadog_monitor" "appservices_http_success_status_rate" {
count = "${var.http_successful_requests_enabled == "true" ? 1 : 0}" count = var.http_successful_requests_enabled == "true" ? 1 : 0
name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] App Services HTTP successful responses too low {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] App Services HTTP successful responses too low {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
type = "query alert" type = "query alert"
message = "${coalesce(var.http_successful_requests_message, var.message)}" message = coalesce(var.http_successful_requests_message, var.message)
query = <<EOQ query = <<EOQ
${var.http_successful_requests_time_aggregator}(${var.http_successful_requests_timeframe}): ${var.http_successful_requests_time_aggregator}(${var.http_successful_requests_timeframe}):
@ -134,47 +130,46 @@ resource "datadog_monitor" "appservices_http_success_status_rate" {
default(avg:azure.app_services.http3xx${module.filter-tags.query_alert} by {resource_group,region,name,instance}.as_rate(), 0) ) / default(avg:azure.app_services.http3xx${module.filter-tags.query_alert} by {resource_group,region,name,instance}.as_rate(), 0) ) /
default(avg:azure.app_services.requests${module.filter-tags.query_alert} by {resource_group,region,name,instance}.as_rate(), 0) default(avg:azure.app_services.requests${module.filter-tags.query_alert} by {resource_group,region,name,instance}.as_rate(), 0)
) * 100, 100) < ${var.http_successful_requests_threshold_critical} ) * 100, 100) < ${var.http_successful_requests_threshold_critical}
EOQ EOQ
evaluation_delay = "${var.evaluation_delay}" thresholds = {
new_host_delay = "${var.new_host_delay}" warning = var.http_successful_requests_threshold_warning
critical = var.http_successful_requests_threshold_critical
thresholds {
warning = "${var.http_successful_requests_threshold_warning}"
critical = "${var.http_successful_requests_threshold_critical}"
} }
notify_no_data = false # Will notify when no data is received evaluation_delay = var.evaluation_delay
new_host_delay = var.new_host_delay
notify_no_data = false
renotify_interval = 0 renotify_interval = 0
require_full_window = false require_full_window = false
timeout_h = 1 timeout_h = 1
include_tags = true include_tags = true
tags = ["env:${var.environment}", "type:cloud", "provider:azure", "resource:app-services", "team:claranet", "created-by:terraform", "${var.http_successful_requests_extra_tags}"] tags = concat(["env:${var.environment}", "type:cloud", "provider:azure", "resource:app-services", "team:claranet", "created-by:terraform"], var.http_successful_requests_extra_tags)
} }
# Monitoring App Services status # Monitoring App Services status
resource "datadog_monitor" "appservices_status" { resource "datadog_monitor" "appservices_status" {
count = "${var.status_enabled == "true" ? 1 : 0}" count = var.status_enabled == "true" ? 1 : 0
name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] App Services is down" name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] App Services is down"
type = "metric alert" type = "metric alert"
message = "${coalesce(var.status_message, var.message)}" message = coalesce(var.status_message, var.message)
query = <<EOQ query = <<EOQ
${var.status_time_aggregator}(${var.status_timeframe}):avg:azure.app_services.status${module.filter-tags.query_alert} by {resource_group,region,name} < 1 ${var.status_time_aggregator}(${var.status_timeframe}):avg:azure.app_services.status${module.filter-tags.query_alert} by {resource_group,region,name} < 1
EOQ EOQ
evaluation_delay = "${var.evaluation_delay}" thresholds = {
new_host_delay = "${var.new_host_delay}"
thresholds {
critical = 1 critical = 1
} }
notify_no_data = true # Will notify when no data is received evaluation_delay = var.evaluation_delay
new_host_delay = var.new_host_delay
notify_no_data = true
renotify_interval = 0 renotify_interval = 0
require_full_window = false require_full_window = false
timeout_h = 0 timeout_h = 0
include_tags = true include_tags = true
tags = ["env:${var.environment}", "type:cloud", "provider:azure", "resource:app-services", "team:claranet", "created-by:terraform", "${var.status_extra_tags}"] tags = concat(["env:${var.environment}", "type:cloud", "provider:azure", "resource:app-services", "team:claranet", "created-by:terraform"], var.status_extra_tags)
} }

View File

@ -1,29 +1,30 @@
output "appservices_response_time_id" { output "appservices_response_time_id" {
description = "id for monitor appservices_response_time" description = "id for monitor appservices_response_time"
value = "${datadog_monitor.appservices_response_time.*.id}" value = datadog_monitor.appservices_response_time.*.id
} }
output "appservices_memory_usage_count_id" { output "appservices_memory_usage_count_id" {
description = "id for monitor appservices_memory_usage_count" description = "id for monitor appservices_memory_usage_count"
value = "${datadog_monitor.appservices_memory_usage_count.*.id}" value = datadog_monitor.appservices_memory_usage_count.*.id
} }
output "appservices_http_5xx_errors_count_id" { output "appservices_http_5xx_errors_count_id" {
description = "id for monitor appservices_http_5xx_errors_count" description = "id for monitor appservices_http_5xx_errors_count"
value = "${datadog_monitor.appservices_http_5xx_errors_count.*.id}" value = datadog_monitor.appservices_http_5xx_errors_count.*.id
} }
output "appservices_http_4xx_errors_count_id" { output "appservices_http_4xx_errors_count_id" {
description = "id for monitor appservices_http_4xx_errors_count" description = "id for monitor appservices_http_4xx_errors_count"
value = "${datadog_monitor.appservices_http_4xx_errors_count.*.id}" value = datadog_monitor.appservices_http_4xx_errors_count.*.id
} }
output "appservices_http_success_status_rate_id" { output "appservices_http_success_status_rate_id" {
description = "id for monitor appservices_http_success_status_rate" description = "id for monitor appservices_http_success_status_rate"
value = "${datadog_monitor.appservices_http_success_status_rate.*.id}" value = datadog_monitor.appservices_http_success_status_rate.*.id
} }
output "appservices_status_id" { output "appservices_status_id" {
description = "id for monitor appservices_status" description = "id for monitor appservices_status"
value = "${datadog_monitor.appservices_status.*.id}" value = datadog_monitor.appservices_status.*.id
} }

Some files were not shown because too many files have changed in this diff Show More