From ce25b90c59de311d29c06df999f731ea7953485d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9my=20MARMOL?= Date: Fri, 24 Jan 2020 15:31:31 +0100 Subject: [PATCH] MON-221 change process to service check --- README.md | 2 +- database/zookeeper/README.md | 63 ++++++++++++-------- database/zookeeper/inputs.tf | 76 +++++++++++++++++++++--- database/zookeeper/monitors-zookeeper.tf | 55 +++++++++-------- database/zookeeper/outputs.tf | 6 +- 5 files changed, 141 insertions(+), 61 deletions(-) diff --git a/README.md b/README.md index f4cd71f..b62e5e7 100644 --- a/README.md +++ b/README.md @@ -203,6 +203,7 @@ module "datadog-monitors-system-generic" { - [redis](https://github.com/claranet/terraform-datadog-monitors/tree/master/database/redis/) - [solr](https://github.com/claranet/terraform-datadog-monitors/tree/master/database/solr/) - [sqlserver](https://github.com/claranet/terraform-datadog-monitors/tree/master/database/sqlserver/) + - [zookeeper](https://github.com/claranet/terraform-datadog-monitors/tree/master/database/zookeeper/) - [middleware](https://github.com/claranet/terraform-datadog-monitors/tree/master/middleware/) - [apache](https://github.com/claranet/terraform-datadog-monitors/tree/master/middleware/apache/) - [kong](https://github.com/claranet/terraform-datadog-monitors/tree/master/middleware/kong/) @@ -219,4 +220,3 @@ module "datadog-monitors-system-generic" { - [system](https://github.com/claranet/terraform-datadog-monitors/tree/master/system/) - [generic](https://github.com/claranet/terraform-datadog-monitors/tree/master/system/generic/) - [unreachable](https://github.com/claranet/terraform-datadog-monitors/tree/master/system/unreachable/) - diff --git a/database/zookeeper/README.md b/database/zookeeper/README.md index 547d125..2109ada 100644 --- a/database/zookeeper/README.md +++ b/database/zookeeper/README.md @@ -2,9 +2,10 @@ ## How to use this module -``` +```hcl module "datadog-monitors-database-zookeeper" { - source = 
"git::ssh://git@git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors.git//database/zookeeper?ref={revision}" + source = "claranet/monitors/datadog//database/zookeeper" + version = "{revision}" environment = var.environment message = module.datadog-message-alerting.alerting-message @@ -17,39 +18,49 @@ module "datadog-monitors-database-zookeeper" { Creates DataDog monitors with the following checks: - Zookeeper latency -- Zookeeper process is down +- Zookeeper service does not respond ## Inputs | Name | Description | Type | Default | Required | -|------|-------------|:----:|:-----:|:-----:| -| environment | Architecture environment | string | n/a | yes | -| evaluation\_delay | Delay in seconds for the metric evaluation | string | `"15"` | no | -| filter\_tags\_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `"*"` | no | -| filter\_tags\_custom\_excluded | Tags excluded for custom filtering when filter_tags_use_defaults is false | string | `""` | no | -| filter\_tags\_use\_defaults | Use default filter tags convention | string | `"true"` | no | -| message | Message sent when a monitor is triggered | string | n/a | yes | -| new\_host\_delay | Delay in seconds before begin to monitor new host | string | `"300"` | no | -| prefix\_slug | Prefix string to prepend between brackets on every monitors names | string | `""` | no | -| zookeeper\_latency\_availability\_extra\_tags | Extra tags for zookeeper read latency monitor | list(string) | `[]` | no | -| zookeeper\_latency\_enabled | Flag to enable Zookeeper read latency monitor | string | `"true"` | no | -| zookeeper\_latency\_status\_message | Custom message for Zookeeper read latency monitor | string | `""` | no | -| zookeeper\_latency\_threshold\_critical | Maximum critical acceptable ms of zookeeper latency monitor | string | `"300000"` | no | -| zookeeper\_latency\_threshold\_warning | Maximum warning acceptable ms of zookeeper latency monitor | string | 
`"250000"` | no | -| zookeeper\_latency\_time\_aggregator | Monitor time aggregator for Zookeeper read latency monitor [available values: min, max or avg] | string | `"avg"` | no | -| zookeeper\_latency\_timeframe | Monitor timeframe for Zookeeper read latency monitor [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_15m"` | no | -| zookeeper\_process\_enabled | Flag to enable Zookeeper does not respond monitor | string | `"true"` | no | -| zookeeper\_process\_extra\_tags | Extra tags for Zookeeper does not respond monitor | list(string) | `[]` | no | -| zookeeper\_process\_message | Custom message for Zookeeper does not respond monitor | string | `""` | no | -| zookeeper\_process\_time\_aggregator | Time aggregator for the Zookeeper does not respond monitor | string | `"avg"` | no | -| zookeeper\_process\_timeframe | Timeframe for the does not respond monitor | string | `"last_10m"` | no | +|------|-------------|------|---------|:-----:| +| environment | Architecture environment | `string` | n/a | yes | +| evaluation\_delay | Delay in seconds for the metric evaluation | `number` | `15` | no | +| filter\_tags\_custom | Tags used for custom filtering when filter\_tags\_use\_defaults is false | `string` | `"*"` | no | +| filter\_tags\_custom\_excluded | Tags excluded for custom filtering when filter\_tags\_use\_defaults is false | `string` | `""` | no | +| filter\_tags\_use\_defaults | Use default filter tags convention | `string` | `"true"` | no | +| message | Message sent when a monitor is triggered | `any` | n/a | yes | +| new\_host\_delay | Delay in seconds before begin to monitor new host | `number` | `300` | no | +| not\_responding\_group\_by | List of tags to use to group data | `list(string)` |
`["host", "server"]` | no | +| not\_responding\_locked | Lock Zookeeper not responding monitor | `bool` | `false` | no | +| not\_responding\_no\_data\_timeframe | Zookeeper monitor no\_data\_timeframe | `number` | `10` | no | +| not\_responding\_notify\_audit | Enable or not notify audit on Zookeeper not responding monitor | `bool` | `false` | no | +| not\_responding\_notify\_no\_data | Send notification if not\_responding monitor does not retrieve data | `bool` | `true` | no | +| not\_responding\_threshold\_warning | Zookeeper not responding limit (warning threshold) | `number` | `3` | no | +| not\_responding\_timeout\_h | Number of hour of Zookeeper not responding monitor not reporting data before it will automatically resolve from a triggered state | `number` | `0` | no | +| prefix\_slug | Prefix string to prepend between brackets on every monitors names | `string` | `""` | no | +| zookeeper\_latency\_availability\_extra\_tags | Extra tags for zookeeper read latency monitor | `list(string)` | `[]` | no | +| zookeeper\_latency\_enabled | Flag to enable Zookeeper read latency monitor | `string` | `"true"` | no | +| zookeeper\_latency\_group\_by | Tags to use to group datas | `list(string)` |
`["host"]` | no | +| zookeeper\_latency\_notify\_audit | Enable or not notify audit on Zookeeper latency monitor | `bool` | `false` | no | +| zookeeper\_latency\_status\_message | Custom message for Zookeeper read latency monitor | `string` | `""` | no | +| zookeeper\_latency\_threshold\_critical | Maximum critical acceptable ms of zookeeper latency monitor | `number` | `300000` | no | +| zookeeper\_latency\_threshold\_warning | Maximum warning acceptable ms of zookeeper latency monitor | `number` | `250000` | no | +| zookeeper\_latency\_time\_aggregator | Monitor time aggregator for Zookeeper read latency monitor [available values: min, max or avg] | `string` | `"avg"` | no | +| zookeeper\_latency\_timeframe | Monitor timeframe for Zookeeper read latency monitor [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | `string` | `"last_15m"` | no | +| zookeeper\_latency\_timeout\_h | Number of hour of Zookeeper latency monitor not reporting data before it will automatically resolve from a triggered state | `number` | `0` | no | +| zookeeper\_not\_responding\_enabled | Flag to enable Zookeeper does not respond monitor | `string` | `"true"` | no | +| zookeeper\_not\_responding\_extra\_tags | Extra tags for Zookeeper does not respond monitor | `list(string)` | `[]` | no | +| zookeeper\_not\_responding\_message | Custom message for Zookeeper does not respond monitor | `string` | `""` | no | +| zookeeper\_not\_responding\_time\_aggregator | Time aggregator for the Zookeeper does not respond monitor | `string` | `"avg"` | no | +| zookeeper\_not\_responding\_timeframe | Timeframe for the does not respond monitor | `string` | `"last_5m"` | no | ## Outputs | Name | Description | |------|-------------| -| datadog\_monitor\_zookeeper\_latency\_id | id for monitor datadog_monitor_zookeeper_latency | -| datadog\_zookeeper\_process\_down\_id | id for monitor datadog_zookeeper_process_down | +| datadog\_monitor\_zookeeper\_latency\_id | id for monitor
datadog\_monitor\_zookeeper\_latency | +| not\_responding\_id | id for monitor not\_responding | ## Related documentation * [Integration Datadog & ElasticSearch](https://docs.datadoghq.com/integrations/elastic/) diff --git a/database/zookeeper/inputs.tf b/database/zookeeper/inputs.tf index 42c0b04..73ddf01 100644 --- a/database/zookeeper/inputs.tf +++ b/database/zookeeper/inputs.tf @@ -40,45 +40,95 @@ variable "prefix_slug" { default = "" } +# # Service Check -variable "zookeeper_process_enabled" { +# +variable "not_responding_notify_no_data" { + default = true + type = bool + description = "Send notification if not_responding monitor does not retrieve data" +} +variable "not_responding_no_data_timeframe" { + default = 10 + description = "Zookeeper monitor no_data_timeframe" + type = number +} + +variable "not_responding_group_by" { + default = ["host", "server"] + type = list(string) + description = "List of tags to use to group data" +} + +variable "zookeeper_not_responding_enabled" { description = "Flag to enable Zookeeper does not respond monitor" type = string default = "true" } -variable "zookeeper_process_message" { +variable "zookeeper_not_responding_message" { description = "Custom message for Zookeeper does not respond monitor" type = string default = "" } -variable "zookeeper_process_time_aggregator" { +variable "zookeeper_not_responding_time_aggregator" { description = "Time aggregator for the Zookeeper does not respond monitor" type = string default = "avg" } -variable "zookeeper_process_timeframe" { +variable "zookeeper_not_responding_timeframe" { description = "Timeframe for the does not respond monitor" type = string - default = "last_10m" + default = "last_5m" } -variable "zookeeper_process_extra_tags" { +variable "zookeeper_not_responding_extra_tags" { description = "Extra tags for Zookeeper does not respond monitor" type = list(string) default = [] } +variable "not_responding_threshold_warning" { + default = 3 + type = number + description = 
"Zookeeper not responding limit (warning threshold)" +} -## Check read latency monitor +variable "not_responding_notify_audit" { + description = "Enable or not notify audit on Zookeeper not responding monitor" + type = bool + default = false +} + +variable "not_responding_locked" { + description = "Lock Zookeeper not responding monitor" + type = bool + default = false +} + +variable "not_responding_timeout_h" { + description = "Number of hour of Zookeeper not responding monitor not reporting data before it will automatically resolve from a triggered state" + type = number + default = 0 +} + +# +# Check read latency monitor +# variable "zookeeper_latency_enabled" { description = "Flag to enable Zookeeper read latency monitor" type = string default = "true" } +variable "zookeeper_latency_group_by" { + description = "Tags to use to group datas" + type = list(string) + default = ["host"] +} + variable "zookeeper_latency_status_message" { description = "Custom message for Zookeeper read latency monitor" type = string @@ -112,3 +162,15 @@ variable "zookeeper_latency_availability_extra_tags" { type = list(string) default = [] } + +variable "zookeeper_latency_notify_audit" { + description = "Enable or not notify audit on Zookeeper latency monitor" + type = bool + default = false +} + +variable "zookeeper_latency_timeout_h" { + description = "Number of hour of Zookeeper latency monitor not reporting data before it will automatically resolve from a triggered state" + type = number + default = 0 +} diff --git a/database/zookeeper/monitors-zookeeper.tf b/database/zookeeper/monitors-zookeeper.tf index ff25ac4..d6ee28e 100644 --- a/database/zookeeper/monitors-zookeeper.tf +++ b/database/zookeeper/monitors-zookeeper.tf @@ -1,39 +1,45 @@ -resource "datadog_monitor" "datadog_zookeeper_process_down" { - count = var.zookeeper_process_enabled == "true" ? 1 : 0 - name = "${var.prefix_slug == "" ? 
"" : "[${var.prefix_slug}]"}[${var.environment}] Zookeeper process is down" - message = coalesce(var.zookeeper_process_message, var.message) - type = "metric alert" +resource "datadog_monitor" "not_responding" { + count = var.zookeeper_not_responding_enabled ? 1 : 0 + name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Zookeeper service does not respond {{#is_alert}}{{{comparator}}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ({{value}}){{/is_warning}}" + message = coalesce(var.zookeeper_not_responding_message, var.message) + type = "service check" query = < ${var.zookeeper_latency_threshold_critical} + zookeeper.avg_latency${module.filter-tags.query_alert} by {${join(",", var.zookeeper_latency_group_by)}}) > ${var.zookeeper_latency_threshold_critical} EOQ thresholds = { @@ -42,17 +48,18 @@ EOQ } notify_no_data = false - evaluation_delay = 15 - new_host_delay = 300 - notify_audit = false - timeout_h = 0 + evaluation_delay = var.evaluation_delay + new_host_delay = var.new_host_delay + notify_audit = var.zookeeper_latency_notify_audit + timeout_h = var.zookeeper_latency_timeout_h include_tags = true locked = false require_full_window = true - tags = concat(["env:${var.environment}", "type:database", "provider:zookeeper", "resource:zookeeper", "team:claranet", "created-by:terraform"], var.zookeeper_latency_availability_extra_tags) + tags = concat(["env:${var.environment}", "type:database", "provider:zookeeper", "resource:zookeeper", "team:claranet", + "created-by:terraform"], var.zookeeper_latency_availability_extra_tags) lifecycle { - ignore_changes = ["silenced"] + ignore_changes = [silenced] } } diff --git a/database/zookeeper/outputs.tf b/database/zookeeper/outputs.tf index bdc52b3..f9403da 100644 --- a/database/zookeeper/outputs.tf +++ b/database/zookeeper/outputs.tf @@ -3,8 +3,8 @@ output "datadog_monitor_zookeeper_latency_id" { value = 
datadog_monitor.datadog_monitor_zookeeper_latency.*.id } -output "datadog_zookeeper_process_down_id" { - description = "id for monitor datadog_zookeeper_process_down" - value = datadog_monitor.datadog_zookeeper_process_down.*.id +output "not_responding_id" { + description = "id for monitor not_responding" + value = datadog_monitor.not_responding.*.id }