From 0500de1330cba0c19e4b4e39187cb8affe96452b Mon Sep 17 00:00:00 2001 From: Rafael Romero Carmona Date: Mon, 13 Aug 2018 12:15:26 +0200 Subject: [PATCH] MON-228 ES monitors --- README.md | 2 + databases/elasticsearch/README.md | 248 ++++ databases/elasticsearch/inputs.tf | 1097 +++++++++++++++++ .../elasticsearch/monitors-elasticsearch.tf | 859 +++++++++++++ databases/elasticsearch/outputs.tf | 109 ++ 5 files changed, 2315 insertions(+) create mode 100644 databases/elasticsearch/README.md create mode 100644 databases/elasticsearch/inputs.tf create mode 100644 databases/elasticsearch/monitors-elasticsearch.tf create mode 100644 databases/elasticsearch/outputs.tf diff --git a/README.md b/README.md index 1cc513d..47aa669 100644 --- a/README.md +++ b/README.md @@ -104,6 +104,8 @@ The `//` is very important, it's a terraform specific syntax used to separate gi - [mongodb](https://bitbucket.org/morea/terraform.feature.datadog/src/master/database/mongodb/) - [mysql](https://bitbucket.org/morea/terraform.feature.datadog/src/master/database/mysql/) - [redis](https://bitbucket.org/morea/terraform.feature.datadog/src/master/database/redis/) +- [databases](https://bitbucket.org/morea/terraform.feature.datadog/src/master/databases/) + - [elasticsearch](https://bitbucket.org/morea/terraform.feature.datadog/src/master/databases/elasticsearch/) - [middleware](https://bitbucket.org/morea/terraform.feature.datadog/src/master/middleware/) - [apache](https://bitbucket.org/morea/terraform.feature.datadog/src/master/middleware/apache/) - [nginx](https://bitbucket.org/morea/terraform.feature.datadog/src/master/middleware/nginx/) diff --git a/databases/elasticsearch/README.md b/databases/elasticsearch/README.md new file mode 100644 index 0000000..d3df711 --- /dev/null +++ b/databases/elasticsearch/README.md @@ -0,0 +1,248 @@ +# DATABASES ELASTICSEARCH DataDog monitors + +## How to use this module + +``` +module "datadog-monitors-databases-elasticsearch" { + source = 
"git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//databases/elasticsearch?ref={revision}" + + environment = "${var.environment}" + message = "${module.datadog-message-alerting.alerting-message}" +} + +``` + +## Purpose + +Creates DataDog monitors with the following checks: + +- Elasticsearch average index flushing to disk latency +- Elasticsearch average indexing time by document +- Elasticsearch average Old-generation garbage collections latency +- Elasticsearch average search fetch latency +- Elasticsearch average search query latency +- Elasticsearch average Young-generation garbage collections latency +- Elasticsearch change alert on the average time spent by tasks in the queue +- Elasticsearch change alert on the number of currently active queries +- Elasticsearch change alert on the number of query cache evictions +- Elasticsearch change alert on the number of request cache evictions +- Elasticsearch change alert on the number of search fetches currently running +- Elasticsearch change alert on the total number of evictions from the fielddata cache +- ElasticSearch Cluster has unassigned shards +- ElasticSearch Cluster is initializing shards +- ElasticSearch Cluster is relocating shards +- ElasticSearch Cluster status not green +- ElasticSearch does not respond +- ElasticSearch free space < 10% +- Elasticsearch JVM HEAP memory usage +- Elasticsearch JVM memory Old usage +- Elasticsearch JVM memory Young usage +- Elasticsearch number of current open HTTP connections anomaly detected + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|:----:|:-----:|:-----:| +| cluster_initializing_shards_extra_tags | Extra tags for Cluster Status monitor | list | `` | no | +| cluster_initializing_shards_message | Custom message for the Cluster Status monitor | string | `` | no | +| cluster_initializing_shards_silenced | Groups to mute for Cluster Status monitor | map | `` | no | +| 
cluster_initializing_shards_threshold_critical | Cluster Status critical threshold | string | `2` | no | +| cluster_initializing_shards_threshold_warning | Cluster Status warning threshold | string | `1` | no | +| cluster_initializing_shards_time_aggregator | Time aggregator for the Cluster Status monitor | string | `avg` | no | +| cluster_initializing_shards_timeframe | Timeframe for the Cluster Status monitor | string | `last_5m` | no | +| cluster_relocating_shards_extra_tags | Extra tags for Cluster Status monitor | list | `` | no | +| cluster_relocating_shards_message | Custom message for the Cluster Status monitor | string | `` | no | +| cluster_relocating_shards_silenced | Groups to mute for Cluster Status monitor | map | `` | no | +| cluster_relocating_shards_threshold_critical | Cluster Status critical threshold | string | `2` | no | +| cluster_relocating_shards_threshold_warning | Cluster Status warning threshold | string | `1` | no | +| cluster_relocating_shards_time_aggregator | Time aggregator for the Cluster Status monitor | string | `avg` | no | +| cluster_relocating_shards_timeframe | Timeframe for the Cluster Status monitor | string | `last_5m` | no | +| cluster_status_not_green_extra_tags | Extra tags for Cluster Status monitor | list | `` | no | +| cluster_status_not_green_message | Custom message for the Cluster Status monitor | string | `` | no | +| cluster_status_not_green_silenced | Groups to mute for Cluster Status monitor | map | `` | no | +| cluster_status_not_green_threshold_critical | Cluster Status critical threshold | string | `0` | no | +| cluster_status_not_green_threshold_warning | Cluster Status warning threshold | string | `1` | no | +| cluster_status_not_green_time_aggregator | Time aggregator for the Cluster Status monitor | string | `avg` | no | +| cluster_status_not_green_timeframe | Timeframe for the Cluster Status monitor | string | `last_5m` | no | +| cluster_unassigned_shards_extra_tags | Extra tags for Cluster Status 
monitor | list | `` | no | +| cluster_unassigned_shards_message | Custom message for the Cluster Status monitor | string | `` | no | +| cluster_unassigned_shards_silenced | Groups to mute for Cluster Status monitor | map | `` | no | +| cluster_unassigned_shards_threshold_critical | Cluster Status critical threshold | string | `2` | no | +| cluster_unassigned_shards_threshold_warning | Cluster Status warning threshold | string | `1` | no | +| cluster_unassigned_shards_time_aggregator | Time aggregator for the Cluster Status monitor | string | `avg` | no | +| cluster_unassigned_shards_timeframe | Timeframe for the Cluster Status monitor | string | `last_5m` | no | +| environment | Architecture environment | string | - | yes | +| evaluation_delay | Delay in seconds for the metric evaluation | string | `0` | no | +| fetch_change_extra_tags | Extra tags for Cluster Status monitor | list | `` | no | +| fetch_change_message | Custom message for the Cluster Status monitor | string | `` | no | +| fetch_change_silenced | Groups to mute for Cluster Status monitor | map | `` | no | +| fetch_change_threshold_critical | Cluster Status critical threshold | string | `100` | no | +| fetch_change_threshold_warning | Cluster Status warning threshold | string | `75` | no | +| fetch_change_time_aggregator | Time aggregator for the Cluster Status monitor | string | `avg` | no | +| fetch_change_timeframe | Timeframe for the Cluster Status monitor | string | `last_10m` | no | +| fetch_change_timeshift | Timeshift for the Cluster Status monitor | string | `last_10m` | no | +| fetch_latency_extra_tags | Extra tags for Cluster Status monitor | list | `` | no | +| fetch_latency_message | Custom message for the Cluster Status monitor | string | `` | no | +| fetch_latency_silenced | Groups to mute for Cluster Status monitor | map | `` | no | +| fetch_latency_threshold_critical | Cluster Status critical threshold | string | `4` | no | +| fetch_latency_threshold_warning | Cluster Status warning 
threshold | string | `2` | no | +| fetch_latency_time_aggregator | Time aggregator for the Cluster Status monitor | string | `avg` | no | +| fetch_latency_timeframe | Timeframe for the Cluster Status monitor | string | `last_10m` | no | +| field_data_evictions_change_extra_tags | Extra tags for Cluster Status monitor | list | `` | no | +| field_data_evictions_change_message | Custom message for the Cluster Status monitor | string | `` | no | +| field_data_evictions_change_silenced | Groups to mute for Cluster Status monitor | map | `` | no | +| field_data_evictions_change_threshold_critical | Cluster Status critical threshold | string | `120` | no | +| field_data_evictions_change_threshold_warning | Cluster Status warning threshold | string | `60` | no | +| field_data_evictions_change_time_aggregator | Time aggregator for the Cluster Status monitor | string | `avg` | no | +| field_data_evictions_change_timeframe | Timeframe for the Cluster Status monitor | string | `last_15m` | no | +| field_data_evictions_change_timeshift | Timeframe for the Cluster Status monitor | string | `last_15m` | no | +| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | +| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | +| flush_latency_extra_tags | Extra tags for Cluster Status monitor | list | `` | no | +| flush_latency_message | Custom message for the Cluster Status monitor | string | `` | no | +| flush_latency_silenced | Groups to mute for Cluster Status monitor | map | `` | no | +| flush_latency_threshold_critical | Cluster Status critical threshold | string | `100` | no | +| flush_latency_threshold_warning | Cluster Status warning threshold | string | `50` | no | +| flush_latency_time_aggregator | Time aggregator for the Cluster Status monitor | string | `avg` | no | +| flush_latency_timeframe | Timeframe for the Cluster Status monitor | string | `last_10m` | no | +| 
http_connections_anomaly_alert_window | Alert window. | string | `last_15m` | no | +| http_connections_anomaly_count_default_zero | Count default zero. | string | `true` | no | +| http_connections_anomaly_detection_algorithm | Anomaly Detection Algorithm used | string | `agile` | no | +| http_connections_anomaly_deviations | Deviations to detect the anomaly | string | `2` | no | +| http_connections_anomaly_direction | Direction of the anomaly. It can be both, below or above. | string | `above` | no | +| http_connections_anomaly_extra_tags | Extra tags for Cluster Status monitor | list | `` | no | +| http_connections_anomaly_interval | Interval. | string | `60` | no | +| http_connections_anomaly_message | Custom message for the Cluster Status monitor | string | `` | no | +| http_connections_anomaly_seasonality | Seasonality of the algorithm | string | `hourly` | no | +| http_connections_anomaly_silenced | Groups to mute for Cluster Status monitor | map | `` | no | +| http_connections_anomaly_threshold_critical | Cluster Status critical threshold | string | `1` | no | +| http_connections_anomaly_threshold_warning | Cluster Status warning threshold | string | `0.75` | no | +| http_connections_anomaly_time_aggregator | Time aggregator for the Cluster Status monitor | string | `avg` | no | +| http_connections_anomaly_timeframe | Timeframe for the Cluster Status monitor | string | `last_4h` | no | +| indexing_latency_extra_tags | Extra tags for Cluster Status monitor | list | `` | no | +| indexing_latency_message | Custom message for the Cluster Status monitor | string | `` | no | +| indexing_latency_silenced | Groups to mute for Cluster Status monitor | map | `` | no | +| indexing_latency_threshold_critical | Cluster Status critical threshold | string | `15` | no | +| indexing_latency_threshold_warning | Cluster Status warning threshold | string | `10` | no | +| indexing_latency_time_aggregator | Time aggregator for the Cluster Status monitor | string | `avg` | no | +| 
indexing_latency_timeframe | Timeframe for the Cluster Status monitor | string | `last_10m` | no | +| jvm_gc_old_collection_latency_extra_tags | Extra tags for Cluster Status monitor | list | `` | no | +| jvm_gc_old_collection_latency_message | Custom message for the Cluster Status monitor | string | `` | no | +| jvm_gc_old_collection_latency_silenced | Groups to mute for Cluster Status monitor | map | `` | no | +| jvm_gc_old_collection_latency_threshold_critical | Cluster Status critical threshold | string | `200` | no | +| jvm_gc_old_collection_latency_threshold_warning | Cluster Status warning threshold | string | `160` | no | +| jvm_gc_old_collection_latency_time_aggregator | Time aggregator for the Cluster Status monitor | string | `avg` | no | +| jvm_gc_old_collection_latency_timeframe | Timeframe for the Cluster Status monitor | string | `last_10m` | no | +| jvm_gc_young_collection_latency_extra_tags | Extra tags for Cluster Status monitor | list | `` | no | +| jvm_gc_young_collection_latency_message | Custom message for the Cluster Status monitor | string | `` | no | +| jvm_gc_young_collection_latency_silenced | Groups to mute for Cluster Status monitor | map | `` | no | +| jvm_gc_young_collection_latency_threshold_critical | Cluster Status critical threshold | string | `25` | no | +| jvm_gc_young_collection_latency_threshold_warning | Cluster Status warning threshold | string | `20` | no | +| jvm_gc_young_collection_latency_time_aggregator | Time aggregator for the Cluster Status monitor | string | `avg` | no | +| jvm_gc_young_collection_latency_timeframe | Timeframe for the Cluster Status monitor | string | `last_10m` | no | +| jvm_heap_memory_usage_extra_tags | Extra tags for Cluster Status monitor | list | `` | no | +| jvm_heap_memory_usage_message | Custom message for the Cluster Status monitor | string | `` | no | +| jvm_heap_memory_usage_silenced | Groups to mute for Cluster Status monitor | map | `` | no | +| jvm_heap_memory_usage_threshold_critical 
| Cluster Status critical threshold | string | `90` | no | +| jvm_heap_memory_usage_threshold_warning | Cluster Status warning threshold | string | `80` | no | +| jvm_heap_memory_usage_time_aggregator | Time aggregator for the Cluster Status monitor | string | `avg` | no | +| jvm_heap_memory_usage_timeframe | Timeframe for the Cluster Status monitor | string | `last_5m` | no | +| jvm_memory_old_usage_extra_tags | Extra tags for Cluster Status monitor | list | `` | no | +| jvm_memory_old_usage_message | Custom message for the Cluster Status monitor | string | `` | no | +| jvm_memory_old_usage_silenced | Groups to mute for Cluster Status monitor | map | `` | no | +| jvm_memory_old_usage_threshold_critical | Cluster Status critical threshold | string | `90` | no | +| jvm_memory_old_usage_threshold_warning | Cluster Status warning threshold | string | `80` | no | +| jvm_memory_old_usage_time_aggregator | Time aggregator for the Cluster Status monitor | string | `avg` | no | +| jvm_memory_old_usage_timeframe | Timeframe for the Cluster Status monitor | string | `last_10m` | no | +| jvm_memory_young_usage_extra_tags | Extra tags for Cluster Status monitor | list | `` | no | +| jvm_memory_young_usage_message | Custom message for the Cluster Status monitor | string | `` | no | +| jvm_memory_young_usage_silenced | Groups to mute for Cluster Status monitor | map | `` | no | +| jvm_memory_young_usage_threshold_critical | Cluster Status critical threshold | string | `90` | no | +| jvm_memory_young_usage_threshold_warning | Cluster Status warning threshold | string | `80` | no | +| jvm_memory_young_usage_time_aggregator | Time aggregator for the Cluster Status monitor | string | `avg` | no | +| jvm_memory_young_usage_timeframe | Timeframe for the Cluster Status monitor | string | `last_10m` | no | +| message | Message sent when a monitor is triggered | string | - | yes | +| new_host_delay | Delay in seconds for the metric evaluation | string | `300` | no | +| 
node_free_space_extra_tags | Extra tags for Cluster Status monitor | list | `` | no | +| node_free_space_message | Custom message for the Cluster Status monitor | string | `` | no | +| node_free_space_silenced | Groups to mute for Cluster Status monitor | map | `` | no | +| node_free_space_threshold_critical | Cluster Status critical threshold | string | `10` | no | +| node_free_space_threshold_warning | Cluster Status warning threshold | string | `20` | no | +| node_free_space_time_aggregator | Time aggregator for the Cluster Status monitor | string | `sum` | no | +| node_free_space_timeframe | Timeframe for the Cluster Status monitor | string | `last_5m` | no | +| not_responding_by | Group by for the service check | string | `"port","server"` | no | +| not_responding_extra_tags | Extra tags for Elasticsearch does not respond monitor | list | `` | no | +| not_responding_last | Parameter 'last' for the service check | string | `1` | no | +| not_responding_message | Custom message for Elasticsearch does not respond monitor | string | `` | no | +| not_responding_silenced | Groups to mute for Elasticsearch does not respond monitor | map | `` | no | +| not_responding_threshold_critical | Not responding limit (critical threshold) | string | `5` | no | +| not_responding_threshold_warning | Not responding limit (warning threshold) | string | `0` | no | +| query_cache_evictions_change_extra_tags | Extra tags for Cluster Status monitor | list | `` | no | +| query_cache_evictions_change_message | Custom message for the Cluster Status monitor | string | `` | no | +| query_cache_evictions_change_silenced | Groups to mute for Cluster Status monitor | map | `` | no | +| query_cache_evictions_change_threshold_critical | Cluster Status critical threshold | string | `120` | no | +| query_cache_evictions_change_threshold_warning | Cluster Status warning threshold | string | `60` | no | +| query_cache_evictions_change_time_aggregator | Time aggregator for the Cluster Status monitor | 
string | `avg` | no | +| query_cache_evictions_change_timeframe | Timeframe for the Cluster Status monitor | string | `last_15m` | no | +| query_cache_evictions_change_timeshift | Timeframe for the Cluster Status monitor | string | `last_15m` | no | +| request_cache_evictions_change_extra_tags | Extra tags for Cluster Status monitor | list | `` | no | +| request_cache_evictions_change_message | Custom message for the Cluster Status monitor | string | `` | no | +| request_cache_evictions_change_silenced | Groups to mute for Cluster Status monitor | map | `` | no | +| request_cache_evictions_change_threshold_critical | Cluster Status critical threshold | string | `120` | no | +| request_cache_evictions_change_threshold_warning | Cluster Status warning threshold | string | `60` | no | +| request_cache_evictions_change_time_aggregator | Time aggregator for the Cluster Status monitor | string | `avg` | no | +| request_cache_evictions_change_timeframe | Timeframe for the Cluster Status monitor | string | `last_15m` | no | +| request_cache_evictions_change_timeshift | Timeshift for the Cluster Status monitor | string | `last_15m` | no | +| search_query_change_extra_tags | Extra tags for Cluster Status monitor | list | `` | no | +| search_query_change_message | Custom message for the Cluster Status monitor | string | `` | no | +| search_query_change_silenced | Groups to mute for Cluster Status monitor | map | `` | no | +| search_query_change_threshold_critical | Cluster Status critical threshold | string | `100` | no | +| search_query_change_threshold_warning | Cluster Status warning threshold | string | `75` | no | +| search_query_change_time_aggregator | Time aggregator for the Cluster Status monitor | string | `avg` | no | +| search_query_change_timeframe | Timeframe for the Cluster Status monitor | string | `last_10m` | no | +| search_query_change_timeshift | Timeshift for the Cluster Status monitor | string | `last_10m` | no | +| search_query_latency_extra_tags | 
Extra tags for Cluster Status monitor | list | `` | no | +| search_query_latency_message | Custom message for the Cluster Status monitor | string | `` | no | +| search_query_latency_silenced | Groups to mute for Cluster Status monitor | map | `` | no | +| search_query_latency_threshold_critical | Cluster Status critical threshold | string | `1` | no | +| search_query_latency_threshold_warning | Cluster Status warning threshold | string | `0.5` | no | +| search_query_latency_time_aggregator | Time aggregator for the Cluster Status monitor | string | `avg` | no | +| search_query_latency_timeframe | Timeframe for the Cluster Status monitor | string | `last_10m` | no | +| task_time_in_queue_change_extra_tags | Extra tags for Cluster Status monitor | list | `` | no | +| task_time_in_queue_change_message | Custom message for the Cluster Status monitor | string | `` | no | +| task_time_in_queue_change_silenced | Groups to mute for Cluster Status monitor | map | `` | no | +| task_time_in_queue_change_threshold_critical | Cluster Status critical threshold | string | `200` | no | +| task_time_in_queue_change_threshold_warning | Cluster Status warning threshold | string | `100` | no | +| task_time_in_queue_change_time_aggregator | Time aggregator for the Cluster Status monitor | string | `avg` | no | +| task_time_in_queue_change_timeframe | Timeframe for the Cluster Status monitor | string | `last_10m` | no | +| task_time_in_queue_change_timeshift | Timeshift for the Cluster Status monitor | string | `last_10m` | no | + +## Outputs + +| Name | Description | +|------|-------------| +| cluster_initializing_shards_id | id for monitor cluster_initializing_shards | +| cluster_relocating_shards_id | id for monitor cluster_relocating_shards | +| cluster_status_not_green_id | id for monitor cluster_status_not_green | +| cluster_unassigned_shards_id | id for monitor cluster_unassigned_shards | +| fetch_change_id | id for monitor fetch_change | +| fetch_latency_id | id for monitor 
fetch_latency | +| field_data_evictions_change_id | id for monitor field_data_evictions_change | +| flush_latency_id | id for monitor flush_latency | +| http_connections_anomaly_id | id for monitor http_connections_anomaly | +| indexing_latency_id | id for monitor indexing_latency | +| jvm_gc_old_collection_latency_id | id for monitor jvm_gc_old_collection_latency | +| jvm_gc_young_collection_latency_id | id for monitor jvm_gc_young_collection_latency | +| jvm_heap_memory_usage_id | id for monitor jvm_heap_memory_usage | +| jvm_memory_old_usage_id | id for monitor jvm_memory_old_usage | +| jvm_memory_young_usage_id | id for monitor jvm_memory_young_usage | +| node_free_space_id | id for monitor node_free_space | +| not_responding_id | id for monitor not_responding | +| query_cache_evictions_change_id | id for monitor query_cache_evictions_change | +| request_cache_evictions_change_id | id for monitor request_cache_evictions_change | +| search_query_change_id | id for monitor search_query_change | +| search_query_latency_id | id for monitor search_query_latency | +| task_time_in_queue_change_id | id for monitor task_time_in_queue_change | + +## Related documentation + diff --git a/databases/elasticsearch/inputs.tf b/databases/elasticsearch/inputs.tf new file mode 100644 index 0000000..e4c8f21 --- /dev/null +++ b/databases/elasticsearch/inputs.tf @@ -0,0 +1,1097 @@ +# +# Datadog global variables +# +variable "environment" { + description = "Architecture environment" + type = "string" +} + +variable "filter_tags_use_defaults" { + description = "Use default filter tags convention" + default = "true" +} + +variable "filter_tags_custom" { + description = "Tags used for custom filtering when filter_tags_use_defaults is false" + default = "*" +} + +variable "message" { + description = "Message sent when a monitor is triggered" +} + +variable "evaluation_delay" { + description = "Delay in seconds for the metric evaluation" + default = 0 +} + +variable "new_host_delay" { + 
description = "Delay in seconds for the metric evaluation" + default = 300 +} + +# +# Cluster Status Not Green +# +variable "cluster_status_not_green_message" { + description = "Custom message for the Cluster Status monitor" + type = "string" + default = "" +} + +variable "cluster_status_not_green_time_aggregator" { + description = "Time aggregator for the Cluster Status monitor" + type = "string" + default = "avg" +} + +variable "cluster_status_not_green_timeframe" { + description = "Timeframe for the Cluster Status monitor" + type = "string" + default = "last_5m" +} + +variable "cluster_status_not_green_threshold_warning" { + description = "Cluster Status warning threshold" + type = "string" + default = 1 +} + +variable "cluster_status_not_green_threshold_critical" { + description = "Cluster Status critical threshold" + type = "string" + default = 0 +} + +variable "cluster_status_not_green_silenced" { + description = "Groups to mute for Cluster Status monitor" + type = "map" + default = {} +} + +variable "cluster_status_not_green_extra_tags" { + description = "Extra tags for Cluster Status monitor" + type = "list" + default = [] +} + +# +# Cluster Initializing Shards +# +variable "cluster_initializing_shards_message" { + description = "Custom message for the Cluster Status monitor" + type = "string" + default = "" +} + +variable "cluster_initializing_shards_time_aggregator" { + description = "Time aggregator for the Cluster Status monitor" + type = "string" + default = "avg" +} + +variable "cluster_initializing_shards_timeframe" { + description = "Timeframe for the Cluster Status monitor" + type = "string" + default = "last_5m" +} + +variable "cluster_initializing_shards_threshold_warning" { + description = "Cluster Status warning threshold" + type = "string" + default = 1 +} + +variable "cluster_initializing_shards_threshold_critical" { + description = "Cluster Status critical threshold" + type = "string" + default = 2 +} + +variable 
"cluster_initializing_shards_silenced" { + description = "Groups to mute for Cluster Status monitor" + type = "map" + default = {} +} + +variable "cluster_initializing_shards_extra_tags" { + description = "Extra tags for Cluster Status monitor" + type = "list" + default = [] +} + +# +# Cluster Relocating Shards +# +variable "cluster_relocating_shards_message" { + description = "Custom message for the Cluster Status monitor" + type = "string" + default = "" +} + +variable "cluster_relocating_shards_time_aggregator" { + description = "Time aggregator for the Cluster Status monitor" + type = "string" + default = "avg" +} + +variable "cluster_relocating_shards_timeframe" { + description = "Timeframe for the Cluster Status monitor" + type = "string" + default = "last_5m" +} + +variable "cluster_relocating_shards_threshold_warning" { + description = "Cluster Status warning threshold" + type = "string" + default = 1 +} + +variable "cluster_relocating_shards_threshold_critical" { + description = "Cluster Status critical threshold" + type = "string" + default = 2 +} + +variable "cluster_relocating_shards_silenced" { + description = "Groups to mute for Cluster Status monitor" + type = "map" + default = {} +} + +variable "cluster_relocating_shards_extra_tags" { + description = "Extra tags for Cluster Status monitor" + type = "list" + default = [] +} + +# +# Cluster Unassigned Shards +# +variable "cluster_unassigned_shards_message" { + description = "Custom message for the Cluster Status monitor" + type = "string" + default = "" +} + +variable "cluster_unassigned_shards_time_aggregator" { + description = "Time aggregator for the Cluster Status monitor" + type = "string" + default = "avg" +} + +variable "cluster_unassigned_shards_timeframe" { + description = "Timeframe for the Cluster Status monitor" + type = "string" + default = "last_5m" +} + +variable "cluster_unassigned_shards_threshold_warning" { + description = "Cluster Status warning threshold" + type = "string" + 
default = 1 +} + +variable "cluster_unassigned_shards_threshold_critical" { + description = "Cluster Status critical threshold" + type = "string" + default = 2 +} + +variable "cluster_unassigned_shards_silenced" { + description = "Groups to mute for Cluster Status monitor" + type = "map" + default = {} +} + +variable "cluster_unassigned_shards_extra_tags" { + description = "Extra tags for Cluster Status monitor" + type = "list" + default = [] +} + +# +# Free Space in nodes +# +variable "node_free_space_message" { + description = "Custom message for the Cluster Status monitor" + type = "string" + default = "" +} + +variable "node_free_space_time_aggregator" { + description = "Time aggregator for the Cluster Status monitor" + type = "string" + default = "sum" +} + +variable "node_free_space_timeframe" { + description = "Timeframe for the Cluster Status monitor" + type = "string" + default = "last_5m" +} + +variable "node_free_space_threshold_warning" { + description = "Cluster Status warning threshold" + type = "string" + default = 20 +} + +variable "node_free_space_threshold_critical" { + description = "Cluster Status critical threshold" + type = "string" + default = 10 +} + +variable "node_free_space_silenced" { + description = "Groups to mute for Cluster Status monitor" + type = "map" + default = {} +} + +variable "node_free_space_extra_tags" { + description = "Extra tags for Cluster Status monitor" + type = "list" + default = [] +} + +# +# JVM Heap Memory Usage +# +variable "jvm_heap_memory_usage_message" { + description = "Custom message for the Cluster Status monitor" + type = "string" + default = "" +} + +variable "jvm_heap_memory_usage_time_aggregator" { + description = "Time aggregator for the Cluster Status monitor" + type = "string" + default = "avg" +} + +variable "jvm_heap_memory_usage_timeframe" { + description = "Timeframe for the Cluster Status monitor" + type = "string" + default = "last_5m" +} + +variable "jvm_heap_memory_usage_threshold_warning" { 
+ description = "Cluster Status warning threshold" + type = "string" + default = 80 +} + +variable "jvm_heap_memory_usage_threshold_critical" { + description = "Cluster Status critical threshold" + type = "string" + default = 90 +} + +variable "jvm_heap_memory_usage_silenced" { + description = "Groups to mute for Cluster Status monitor" + type = "map" + default = {} +} + +variable "jvm_heap_memory_usage_extra_tags" { + description = "Extra tags for Cluster Status monitor" + type = "list" + default = [] +} + +# +# JVM Memory Young Usage +# +variable "jvm_memory_young_usage_message" { + description = "Custom message for the Cluster Status monitor" + type = "string" + default = "" +} + +variable "jvm_memory_young_usage_time_aggregator" { + description = "Time aggregator for the Cluster Status monitor" + type = "string" + default = "avg" +} + +variable "jvm_memory_young_usage_timeframe" { + description = "Timeframe for the Cluster Status monitor" + type = "string" + default = "last_10m" +} + +variable "jvm_memory_young_usage_threshold_warning" { + description = "Cluster Status warning threshold" + type = "string" + default = 80 +} + +variable "jvm_memory_young_usage_threshold_critical" { + description = "Cluster Status critical threshold" + type = "string" + default = 90 +} + +variable "jvm_memory_young_usage_silenced" { + description = "Groups to mute for Cluster Status monitor" + type = "map" + default = {} +} + +variable "jvm_memory_young_usage_extra_tags" { + description = "Extra tags for Cluster Status monitor" + type = "list" + default = [] +} + +# +# JVM Memory Old Usage +# +variable "jvm_memory_old_usage_message" { + description = "Custom message for the Cluster Status monitor" + type = "string" + default = "" +} + +variable "jvm_memory_old_usage_time_aggregator" { + description = "Time aggregator for the Cluster Status monitor" + type = "string" + default = "avg" +} + +variable "jvm_memory_old_usage_timeframe" { + description = "Timeframe for the Cluster 
Status monitor" + type = "string" + default = "last_10m" +} + +variable "jvm_memory_old_usage_threshold_warning" { + description = "Cluster Status warning threshold" + type = "string" + default = 80 +} + +variable "jvm_memory_old_usage_threshold_critical" { + description = "Cluster Status critical threshold" + type = "string" + default = 90 +} + +variable "jvm_memory_old_usage_silenced" { + description = "Groups to mute for Cluster Status monitor" + type = "map" + default = {} +} + +variable "jvm_memory_old_usage_extra_tags" { + description = "Extra tags for Cluster Status monitor" + type = "list" + default = [] +} + +# +# JVM Garbage Collector Old Collection Latency +# +variable "jvm_gc_old_collection_latency_message" { + description = "Custom message for the Cluster Status monitor" + type = "string" + default = "" +} + +variable "jvm_gc_old_collection_latency_time_aggregator" { + description = "Time aggregator for the Cluster Status monitor" + type = "string" + default = "avg" +} + +variable "jvm_gc_old_collection_latency_timeframe" { + description = "Timeframe for the Cluster Status monitor" + type = "string" + default = "last_10m" +} + +variable "jvm_gc_old_collection_latency_threshold_warning" { + description = "Cluster Status warning threshold" + type = "string" + default = 160 +} + +variable "jvm_gc_old_collection_latency_threshold_critical" { + description = "Cluster Status critical threshold" + type = "string" + default = 200 +} + +variable "jvm_gc_old_collection_latency_silenced" { + description = "Groups to mute for Cluster Status monitor" + type = "map" + default = {} +} + +variable "jvm_gc_old_collection_latency_extra_tags" { + description = "Extra tags for Cluster Status monitor" + type = "list" + default = [] +} + +# +# JVM Garbage Collector Young Collection Latency +# +variable "jvm_gc_young_collection_latency_message" { + description = "Custom message for the Cluster Status monitor" + type = "string" + default = "" +} + +variable
"jvm_gc_young_collection_latency_time_aggregator" { + description = "Time aggregator for the Cluster Status monitor" + type = "string" + default = "avg" +} + +variable "jvm_gc_young_collection_latency_timeframe" { + description = "Timeframe for the Cluster Status monitor" + type = "string" + default = "last_10m" +} + +variable "jvm_gc_young_collection_latency_threshold_warning" { + description = "Cluster Status warning threshold" + type = "string" + default = 20 +} + +variable "jvm_gc_young_collection_latency_threshold_critical" { + description = "Cluster Status critical threshold" + type = "string" + default = 25 +} + +variable "jvm_gc_young_collection_latency_silenced" { + description = "Groups to mute for Cluster Status monitor" + type = "map" + default = {} +} + +variable "jvm_gc_young_collection_latency_extra_tags" { + description = "Extra tags for Cluster Status monitor" + type = "list" + default = [] +} + +# +# Indexing Latency +# +variable "indexing_latency_message" { + description = "Custom message for the Cluster Status monitor" + type = "string" + default = "" +} + +variable "indexing_latency_time_aggregator" { + description = "Time aggregator for the Cluster Status monitor" + type = "string" + default = "avg" +} + +variable "indexing_latency_timeframe" { + description = "Timeframe for the Cluster Status monitor" + type = "string" + default = "last_10m" +} + +variable "indexing_latency_threshold_warning" { + description = "Cluster Status warning threshold" + type = "string" + default = 10 +} + +variable "indexing_latency_threshold_critical" { + description = "Cluster Status critical threshold" + type = "string" + default = 15 +} + +variable "indexing_latency_silenced" { + description = "Groups to mute for Cluster Status monitor" + type = "map" + default = {} +} + +variable "indexing_latency_extra_tags" { + description = "Extra tags for Cluster Status monitor" + type = "list" + default = [] +} + +# +# Flush Latency +# +variable "flush_latency_message" { + 
description = "Custom message for the Cluster Status monitor" + type = "string" + default = "" +} + +variable "flush_latency_time_aggregator" { + description = "Time aggregator for the Cluster Status monitor" + type = "string" + default = "avg" +} + +variable "flush_latency_timeframe" { + description = "Timeframe for the Cluster Status monitor" + type = "string" + default = "last_10m" +} + +variable "flush_latency_threshold_warning" { + description = "Cluster Status warning threshold" + type = "string" + default = 50 +} + +variable "flush_latency_threshold_critical" { + description = "Cluster Status critical threshold" + type = "string" + default = 100 +} + +variable "flush_latency_silenced" { + description = "Groups to mute for Cluster Status monitor" + type = "map" + default = {} +} + +variable "flush_latency_extra_tags" { + description = "Extra tags for Cluster Status monitor" + type = "list" + default = [] +} + +# +# Open HTTP Connections Anomaly +# +variable "http_connections_anomaly_message" { + description = "Custom message for the Cluster Status monitor" + type = "string" + default = "" +} + +variable "http_connections_anomaly_time_aggregator" { + description = "Time aggregator for the Cluster Status monitor" + type = "string" + default = "avg" +} + +variable "http_connections_anomaly_timeframe" { + description = "Timeframe for the Cluster Status monitor" + type = "string" + default = "last_4h" +} + +variable "http_connections_anomaly_detection_algorithm" { + description = "Anomaly Detection Algorithm used" + type = "string" + default = "agile" +} + +variable "http_connections_anomaly_deviations" { + description = "Deviations to detect the anomaly" + type = "string" + default = 2 +} + +variable "http_connections_anomaly_direction" { + description = "Direction of the anomaly. It can be both, below or above." + type = "string" + default = "above" +} + +variable "http_connections_anomaly_alert_window" { + description = "Alert window." 
+ type = "string" + default = "last_15m" +} + +variable "http_connections_anomaly_interval" { + description = "Interval." + type = "string" + default = 60 +} + +variable "http_connections_anomaly_count_default_zero" { + description = "Count default zero." + type = "string" + default = "true" +} + +variable "http_connections_anomaly_seasonality" { + description = "Seasonality of the algorithm" + type = "string" + default = "hourly" +} + +variable "http_connections_anomaly_threshold_warning" { + description = "Cluster Status warning threshold" + type = "string" + default = 0.75 +} + +variable "http_connections_anomaly_threshold_critical" { + description = "Cluster Status critical threshold" + type = "string" + default = 1 +} + +variable "http_connections_anomaly_silenced" { + description = "Groups to mute for Cluster Status monitor" + type = "map" + default = {} +} + +variable "http_connections_anomaly_extra_tags" { + description = "Extra tags for Cluster Status monitor" + type = "list" + default = [] +} + +# +# Query Latency +# +variable "search_query_latency_message" { + description = "Custom message for the Cluster Status monitor" + type = "string" + default = "" +} + +variable "search_query_latency_time_aggregator" { + description = "Time aggregator for the Cluster Status monitor" + type = "string" + default = "avg" +} + +variable "search_query_latency_timeframe" { + description = "Timeframe for the Cluster Status monitor" + type = "string" + default = "last_10m" +} + +variable "search_query_latency_threshold_warning" { + description = "Cluster Status warning threshold" + type = "string" + default = 0.5 +} + +variable "search_query_latency_threshold_critical" { + description = "Cluster Status critical threshold" + type = "string" + default = 1 +} + +variable "search_query_latency_silenced" { + description = "Groups to mute for Cluster Status monitor" + type = "map" + default = {} +} + +variable "search_query_latency_extra_tags" { + description = "Extra tags for 
Cluster Status monitor" + type = "list" + default = [] +} + +# +# Fetch Latency +# +variable "fetch_latency_message" { + description = "Custom message for the Cluster Status monitor" + type = "string" + default = "" +} + +variable "fetch_latency_time_aggregator" { + description = "Time aggregator for the Cluster Status monitor" + type = "string" + default = "avg" +} + +variable "fetch_latency_timeframe" { + description = "Timeframe for the Cluster Status monitor" + type = "string" + default = "last_10m" +} + +variable "fetch_latency_threshold_warning" { + description = "Cluster Status warning threshold" + type = "string" + default = 2 +} + +variable "fetch_latency_threshold_critical" { + description = "Cluster Status critical threshold" + type = "string" + default = 4 +} + +variable "fetch_latency_silenced" { + description = "Groups to mute for Cluster Status monitor" + type = "map" + default = {} +} + +variable "fetch_latency_extra_tags" { + description = "Extra tags for Cluster Status monitor" + type = "list" + default = [] +} + +# +# Search Query Change +# +variable "search_query_change_message" { + description = "Custom message for the Cluster Status monitor" + type = "string" + default = "" +} + +variable "search_query_change_time_aggregator" { + description = "Time aggregator for the Cluster Status monitor" + type = "string" + default = "avg" +} + +variable "search_query_change_timeframe" { + description = "Timeframe for the Cluster Status monitor" + type = "string" + default = "last_10m" +} + +variable "search_query_change_timeshift" { + description = "Timeshift for the Cluster Status monitor" + type = "string" + default = "last_10m" +} + +variable "search_query_change_threshold_warning" { + description = "Cluster Status warning threshold" + type = "string" + default = 75 +} + +variable "search_query_change_threshold_critical" { + description = "Cluster Status critical threshold" + type = "string" + default = 100 +} + +variable "search_query_change_silenced" 
{ + description = "Groups to mute for Cluster Status monitor" + type = "map" + default = {} +} + +variable "search_query_change_extra_tags" { + description = "Extra tags for Cluster Status monitor" + type = "list" + default = [] +} + +# +# Fetch Change +# +variable "fetch_change_message" { + description = "Custom message for the Cluster Status monitor" + type = "string" + default = "" +} + +variable "fetch_change_time_aggregator" { + description = "Time aggregator for the Cluster Status monitor" + type = "string" + default = "avg" +} + +variable "fetch_change_timeframe" { + description = "Timeframe for the Cluster Status monitor" + type = "string" + default = "last_10m" +} + +variable "fetch_change_timeshift" { + description = "Timeshift for the Cluster Status monitor" + type = "string" + default = "last_10m" +} + +variable "fetch_change_threshold_warning" { + description = "Cluster Status warning threshold" + type = "string" + default = 75 +} + +variable "fetch_change_threshold_critical" { + description = "Cluster Status critical threshold" + type = "string" + default = 100 +} + +variable "fetch_change_silenced" { + description = "Groups to mute for Cluster Status monitor" + type = "map" + default = {} +} + +variable "fetch_change_extra_tags" { + description = "Extra tags for Cluster Status monitor" + type = "list" + default = [] +} + +# +# Field Data Evictions +# +variable "field_data_evictions_change_message" { + description = "Custom message for the Cluster Status monitor" + type = "string" + default = "" +} + +variable "field_data_evictions_change_time_aggregator" { + description = "Time aggregator for the Cluster Status monitor" + type = "string" + default = "avg" +} + +variable "field_data_evictions_change_timeframe" { + description = "Timeframe for the Cluster Status monitor" + type = "string" + default = "last_15m" +} + +variable "field_data_evictions_change_timeshift" { + description = "Timeframe for the Cluster Status monitor" + type = "string" + default 
= "last_15m" +} + +variable "field_data_evictions_change_threshold_warning" { + description = "Cluster Status warning threshold" + type = "string" + default = 60 +} + +variable "field_data_evictions_change_threshold_critical" { + description = "Cluster Status critical threshold" + type = "string" + default = 120 +} + +variable "field_data_evictions_change_silenced" { + description = "Groups to mute for Cluster Status monitor" + type = "map" + default = {} +} + +variable "field_data_evictions_change_extra_tags" { + description = "Extra tags for Cluster Status monitor" + type = "list" + default = [] +} + +# +# Query Cache Evictions +# +variable "query_cache_evictions_change_message" { + description = "Custom message for the Cluster Status monitor" + type = "string" + default = "" +} + +variable "query_cache_evictions_change_time_aggregator" { + description = "Time aggregator for the Cluster Status monitor" + type = "string" + default = "avg" +} + +variable "query_cache_evictions_change_timeframe" { + description = "Timeframe for the Cluster Status monitor" + type = "string" + default = "last_15m" +} + +variable "query_cache_evictions_change_timeshift" { + description = "Timeframe for the Cluster Status monitor" + type = "string" + default = "last_15m" +} + +variable "query_cache_evictions_change_threshold_warning" { + description = "Cluster Status warning threshold" + type = "string" + default = 60 +} + +variable "query_cache_evictions_change_threshold_critical" { + description = "Cluster Status critical threshold" + type = "string" + default = 120 +} + +variable "query_cache_evictions_change_silenced" { + description = "Groups to mute for Cluster Status monitor" + type = "map" + default = {} +} + +variable "query_cache_evictions_change_extra_tags" { + description = "Extra tags for Cluster Status monitor" + type = "list" + default = [] +} + +# +# Request Cache Evictions +# +variable "request_cache_evictions_change_message" { + description = "Custom message for the 
Cluster Status monitor" + type = "string" + default = "" +} + +variable "request_cache_evictions_change_time_aggregator" { + description = "Time aggregator for the Cluster Status monitor" + type = "string" + default = "avg" +} + +variable "request_cache_evictions_change_timeframe" { + description = "Timeframe for the Cluster Status monitor" + type = "string" + default = "last_15m" +} + +variable "request_cache_evictions_change_timeshift" { + description = "Timeshift for the Cluster Status monitor" + type = "string" + default = "last_15m" +} + +variable "request_cache_evictions_change_threshold_warning" { + description = "Cluster Status warning threshold" + type = "string" + default = 60 +} + +variable "request_cache_evictions_change_threshold_critical" { + description = "Cluster Status critical threshold" + type = "string" + default = 120 +} + +variable "request_cache_evictions_change_silenced" { + description = "Groups to mute for Cluster Status monitor" + type = "map" + default = {} +} + +variable "request_cache_evictions_change_extra_tags" { + description = "Extra tags for Cluster Status monitor" + type = "list" + default = [] +} + +# +# Task Time in Queue +# +variable "task_time_in_queue_change_message" { + description = "Custom message for the Cluster Status monitor" + type = "string" + default = "" +} + +variable "task_time_in_queue_change_time_aggregator" { + description = "Time aggregator for the Cluster Status monitor" + type = "string" + default = "avg" +} + +variable "task_time_in_queue_change_timeframe" { + description = "Timeframe for the Cluster Status monitor" + type = "string" + default = "last_10m" +} + +variable "task_time_in_queue_change_timeshift" { + description = "Timeshift for the Cluster Status monitor" + type = "string" + default = "last_10m" +} + +variable "task_time_in_queue_change_threshold_warning" { + description = "Cluster Status warning threshold" + type = "string" + default = 100 +} + +variable 
"task_time_in_queue_change_threshold_critical" { + description = "Cluster Status critical threshold" + type = "string" + default = 200 +} + +variable "task_time_in_queue_change_silenced" { + description = "Groups to mute for Cluster Status monitor" + type = "map" + default = {} +} + +variable "task_time_in_queue_change_extra_tags" { + description = "Extra tags for Cluster Status monitor" + type = "list" + default = [] +} + +# +# Service Check +# +variable "not_responding_silenced" { + description = "Groups to mute for Elasticsearch does not respond monitor" + type = "map" + default = {} +} + +variable "not_responding_message" { + description = "Custom message for Elasticsearch does not respond monitor" + type = "string" + default = "" +} + +variable "not_responding_by" { + description = "Group by for the service check" + type = "string" + default = "\"port\",\"server\"" +} + +variable "not_responding_last" { + description = "Parameter 'last' for the service check" + type = "string" + default = 1 +} + +variable "not_responding_threshold_critical" { + description = "Not responding limit (critical threshold)" + default = 5 +} + +variable "not_responding_threshold_warning" { + description = "Not responding limit (warning threshold)" + default = 0 +} + +variable "not_responding_extra_tags" { + description = "Extra tags for Elasticsearch does not respond monitor" + type = "list" + default = [] +} diff --git a/databases/elasticsearch/monitors-elasticsearch.tf b/databases/elasticsearch/monitors-elasticsearch.tf new file mode 100644 index 0000000..6acb793 --- /dev/null +++ b/databases/elasticsearch/monitors-elasticsearch.tf @@ -0,0 +1,859 @@ +data "template_file" "filter" { + template = "$${filter}" + + vars { + filter = "${var.filter_tags_use_defaults == "true" ? 
format("dd_monitoring:enabled,env:%s", var.environment) : "${var.filter_tags_custom}"}" + } +} + +# +# Cluster Status Not Green +# +resource "datadog_monitor" "cluster_status_not_green" { + name = "[${var.environment}] ElasticSearch Cluster status not green" + message = "${coalesce(var.cluster_status_not_green_message, var.message)}" + + type = "metric alert" + + query = < ${var.cluster_initializing_shards_threshold_critical} +EOF + + thresholds { + warning = "${var.cluster_initializing_shards_threshold_warning}" + critical = "${var.cluster_initializing_shards_threshold_critical}" + } + + notify_audit = false + locked = false + include_tags = true + require_full_window = true + notify_no_data = true + + evaluation_delay = "${var.evaluation_delay}" + + silenced = "${var.cluster_initializing_shards_silenced}" + + tags = [ + "resource:elasticsearch", + "env:${var.environment}", + "created_by:terraform", + "${var.cluster_initializing_shards_extra_tags}", + ] +} + +# +# Cluster Relocating Shards +# +resource "datadog_monitor" "cluster_relocating_shards" { + name = "[${var.environment}] ElasticSearch Cluster is relocating shards" + message = "${coalesce(var.cluster_relocating_shards_message, var.message)}" + + type = "metric alert" + + query = < ${var.cluster_relocating_shards_threshold_critical} +EOF + + thresholds { + warning = "${var.cluster_relocating_shards_threshold_warning}" + critical = "${var.cluster_relocating_shards_threshold_critical}" + } + + notify_audit = false + locked = false + include_tags = true + require_full_window = true + notify_no_data = true + + evaluation_delay = "${var.evaluation_delay}" + + silenced = "${var.cluster_relocating_shards_silenced}" + + tags = [ + "resource:elasticsearch", + "env:${var.environment}", + "created_by:terraform", + "${var.cluster_relocating_shards_extra_tags}", + ] +} + +# +# Cluster Unassigned Shards +# +resource "datadog_monitor" "cluster_unassigned_shards" { + name = "[${var.environment}] ElasticSearch Cluster has 
unassigned shards" + message = "${coalesce(var.cluster_unassigned_shards_message, var.message)}" + + type = "metric alert" + + query = < ${var.cluster_unassigned_shards_threshold_critical} +EOF + + thresholds { + warning = "${var.cluster_unassigned_shards_threshold_warning}" + critical = "${var.cluster_unassigned_shards_threshold_critical}" + } + + notify_audit = false + locked = false + include_tags = true + require_full_window = true + notify_no_data = true + + evaluation_delay = "${var.evaluation_delay}" + + silenced = "${var.cluster_unassigned_shards_silenced}" + + tags = [ + "resource:elasticsearch", + "env:${var.environment}", + "created_by:terraform", + "${var.cluster_unassigned_shards_extra_tags}", + ] +} + +# +# Free Space in nodes +# +resource "datadog_monitor" "node_free_space" { + name = "[${var.environment}] ElasticSearch free space < 10%" + message = "${coalesce(var.node_free_space_message, var.message)}" + + type = "query alert" + + query = < ${var.jvm_heap_memory_usage_threshold_critical} +EOF + + thresholds { + warning = "${var.jvm_heap_memory_usage_threshold_warning}" + critical = "${var.jvm_heap_memory_usage_threshold_critical}" + } + + notify_audit = false + locked = false + include_tags = true + require_full_window = true + notify_no_data = true + + evaluation_delay = "${var.evaluation_delay}" + + silenced = "${var.jvm_heap_memory_usage_silenced}" + + tags = [ + "resource:elasticsearch", + "env:${var.environment}", + "created_by:terraform", + "${var.jvm_heap_memory_usage_extra_tags}", + ] +} + +# +# JVM Memory Young Usage +# +resource "datadog_monitor" "jvm_memory_young_usage" { + name = "[${var.environment}] Elasticsearch JVM memory Young usage {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" + message = "${coalesce(var.jvm_memory_young_usage_message, var.message)}" + + type = "query alert" + + query = < 
${var.jvm_memory_young_usage_threshold_critical} +EOF + + thresholds { + warning = "${var.jvm_memory_young_usage_threshold_warning}" + critical = "${var.jvm_memory_young_usage_threshold_critical}" + } + + notify_audit = false + locked = false + include_tags = true + require_full_window = true + notify_no_data = true + + evaluation_delay = "${var.evaluation_delay}" + + silenced = "${var.jvm_memory_young_usage_silenced}" + + tags = [ + "resource:elasticsearch", + "env:${var.environment}", + "created_by:terraform", + "${var.jvm_memory_young_usage_extra_tags}", + ] +} + +# +# JVM Memory Old Usage +# +resource "datadog_monitor" "jvm_memory_old_usage" { + name = "[${var.environment}] Elasticsearch JVM memory Old usage {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" + message = "${coalesce(var.jvm_memory_old_usage_message, var.message)}" + + type = "query alert" + + query = < ${var.jvm_memory_old_usage_threshold_critical} +EOF + + thresholds { + warning = "${var.jvm_memory_old_usage_threshold_warning}" + critical = "${var.jvm_memory_old_usage_threshold_critical}" + } + + notify_audit = false + locked = false + include_tags = true + require_full_window = true + notify_no_data = true + + evaluation_delay = "${var.evaluation_delay}" + + silenced = "${var.jvm_memory_old_usage_silenced}" + + tags = [ + "resource:elasticsearch", + "env:${var.environment}", + "created_by:terraform", + "${var.jvm_memory_old_usage_extra_tags}", + ] +} + +# +# JVM Garbage Collector Old Collection Latency +# +resource "datadog_monitor" "jvm_gc_old_collection_latency" { + name = "[${var.environment}] Elasticsearch average Old-generation garbage collections latency {{#is_alert}}{{{comparator}}} {{threshold}}ms ({{value}}ms){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}ms ({{value}}ms){{/is_warning}}" + message = "${coalesce(var.jvm_gc_old_collection_latency_message, var.message)}" +
+ type = "query alert" + + query = < ${var.jvm_gc_old_collection_latency_threshold_critical} +EOF + + thresholds { + warning = "${var.jvm_gc_old_collection_latency_threshold_warning}" + critical = "${var.jvm_gc_old_collection_latency_threshold_critical}" + } + + notify_audit = false + locked = false + include_tags = true + require_full_window = true + notify_no_data = true + + evaluation_delay = "${var.evaluation_delay}" + + silenced = "${var.jvm_gc_old_collection_latency_silenced}" + + tags = [ + "resource:elasticsearch", + "env:${var.environment}", + "created_by:terraform", + "${var.jvm_gc_old_collection_latency_extra_tags}", + ] +} + +# +# JVM Garbage Collector Young Collection Latency +# +resource "datadog_monitor" "jvm_gc_young_collection_latency" { + name = "[${var.environment}] Elasticsearch average Young-generation garbage collections latency {{#is_alert}}{{{comparator}}} {{threshold}}ms ({{value}}ms){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}ms ({{value}}ms){{/is_warning}}" + message = "${coalesce(var.jvm_gc_young_collection_latency_message, var.message)}" + + type = "query alert" + + query = < ${var.jvm_gc_young_collection_latency_threshold_critical} +EOF + + thresholds { + warning = "${var.jvm_gc_young_collection_latency_threshold_warning}" + critical = "${var.jvm_gc_young_collection_latency_threshold_critical}" + } + + notify_audit = false + locked = false + include_tags = true + require_full_window = true + notify_no_data = true + + evaluation_delay = "${var.evaluation_delay}" + + silenced = "${var.jvm_gc_young_collection_latency_silenced}" + + tags = [ + "resource:elasticsearch", + "env:${var.environment}", + "created_by:terraform", + "${var.jvm_gc_young_collection_latency_extra_tags}", + ] +} + +# +# Indexing Latency +# +resource "datadog_monitor" "indexing_latency" { + name = "[${var.environment}] Elasticsearch average indexing time by document {{#is_alert}}{{{comparator}}} {{threshold}}ms
({{value}}ms){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}ms ({{value}}ms){{/is_warning}}" + message = "${coalesce(var.indexing_latency_message, var.message)}" + + type = "query alert" + + // TODO add tags to filter by node type and do not apply this monitor on non-data nodes + query = < ${var.indexing_latency_threshold_critical} +EOF + + thresholds { + warning = "${var.indexing_latency_threshold_warning}" + critical = "${var.indexing_latency_threshold_critical}" + } + + notify_audit = false + locked = false + include_tags = true + require_full_window = true + notify_no_data = true + + evaluation_delay = "${var.evaluation_delay}" + + silenced = "${var.indexing_latency_silenced}" + + tags = [ + "resource:elasticsearch", + "env:${var.environment}", + "created_by:terraform", + "${var.indexing_latency_extra_tags}", + ] +} + +# +# Flush Latency +# +resource "datadog_monitor" "flush_latency" { + name = "[${var.environment}] Elasticsearch average index flushing to disk latency {{#is_alert}}{{{comparator}}} {{threshold}}ms ({{value}}ms){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}ms ({{value}}ms){{/is_warning}}" + message = "${coalesce(var.flush_latency_message, var.message)}" + + type = "query alert" + + // TODO add tags to filter by node type and do not apply this monitor on non-data nodes + query = < ${var.flush_latency_threshold_critical} +EOF + + thresholds { + warning = "${var.flush_latency_threshold_warning}" + critical = "${var.flush_latency_threshold_critical}" + } + + notify_audit = false + locked = false + include_tags = true + require_full_window = true + notify_no_data = true + + evaluation_delay = "${var.evaluation_delay}" + + silenced = "${var.flush_latency_silenced}" + + tags = [ + "resource:elasticsearch", + "env:${var.environment}", + "created_by:terraform", + "${var.flush_latency_extra_tags}", + ] +} + +# +# Open HTTP Connections Anomaly +# +resource "datadog_monitor" "http_connections_anomaly" { + name = 
"[${var.environment}] Elasticsearch number of current open HTTP connections anomaly detected" + message = "${coalesce(var.http_connections_anomaly_message, var.message)}" + + type = "query alert" + + query = <= ${var.http_connections_anomaly_threshold_critical} +EOF + + thresholds { + warning = "${var.http_connections_anomaly_threshold_warning}" + critical = "${var.http_connections_anomaly_threshold_critical}" + } + + notify_audit = false + locked = false + include_tags = true + require_full_window = true + notify_no_data = true + + evaluation_delay = "${var.evaluation_delay}" + + silenced = "${var.http_connections_anomaly_silenced}" + + tags = [ + "resource:elasticsearch", + "env:${var.environment}", + "created_by:terraform", + "${var.http_connections_anomaly_extra_tags}", + ] +} + +# +# Query Latency +# +resource "datadog_monitor" "search_query_latency" { + name = "[${var.environment}] Elasticsearch average search query latency {{#is_alert}}{{{comparator}}} {{threshold}}ms ({{value}}ms){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}ms ({{value}}ms){{/is_warning}}" + message = "${coalesce(var.search_query_latency_message, var.message)}" + + type = "query alert" + + // TODO add tags to filter by node type and do not apply this monitor on non-data nodes + query = < ${var.search_query_latency_threshold_critical} +EOF + + thresholds { + warning = "${var.search_query_latency_threshold_warning}" + critical = "${var.search_query_latency_threshold_critical}" + } + + notify_audit = false + locked = false + include_tags = true + require_full_window = true + notify_no_data = true + + evaluation_delay = "${var.evaluation_delay}" + + silenced = "${var.search_query_latency_silenced}" + + tags = [ + "resource:elasticsearch", + "env:${var.environment}", + "created_by:terraform", + "${var.search_query_latency_extra_tags}", + ] +} + +# +# Fetch Latency +# +resource "datadog_monitor" "fetch_latency" { + name = "[${var.environment}] Elasticsearch average search fetch 
latency {{#is_alert}}{{{comparator}}} {{threshold}}ms ({{value}}ms){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}ms ({{value}}ms){{/is_warning}}" + message = "${coalesce(var.fetch_latency_message, var.message)}" + + type = "query alert" + + // TODO add tags to filter by node type and do not apply this monitor on non-data nodes + query = < ${var.fetch_latency_threshold_critical} +EOF + + thresholds { + warning = "${var.fetch_latency_threshold_warning}" + critical = "${var.fetch_latency_threshold_critical}" + } + + notify_audit = false + locked = false + include_tags = true + require_full_window = true + notify_no_data = true + + evaluation_delay = "${var.evaluation_delay}" + + silenced = "${var.fetch_latency_silenced}" + + tags = [ + "resource:elasticsearch", + "env:${var.environment}", + "created_by:terraform", + "${var.fetch_latency_extra_tags}", + ] +} + +# +# Search Query Change +# +resource "datadog_monitor" "search_query_change" { + name = "[${var.environment}] Elasticsearch change alert on the number of currently active queries" + message = "${coalesce(var.search_query_change_message, var.message)}" + + type = "query alert" + + query = <= ${var.search_query_change_threshold_critical} +EOF + + thresholds { + warning = "${var.search_query_change_threshold_warning}" + critical = "${var.search_query_change_threshold_critical}" + } + + notify_audit = false + locked = false + include_tags = true + require_full_window = true + notify_no_data = true + + evaluation_delay = "${var.evaluation_delay}" + + silenced = "${var.search_query_change_silenced}" + + tags = [ + "resource:elasticsearch", + "env:${var.environment}", + "created_by:terraform", + "${var.search_query_change_extra_tags}", + ] +} + +# +# Fetch Change +# +resource "datadog_monitor" "fetch_change" { + name = "[${var.environment}] Elasticsearch change alert on the number of search fetches currently running" + message = "${coalesce(var.fetch_change_message, var.message)}" + + type = "query 
alert" + + query = <= ${var.fetch_change_threshold_critical} +EOF + + thresholds { + warning = "${var.fetch_change_threshold_warning}" + critical = "${var.fetch_change_threshold_critical}" + } + + notify_audit = false + locked = false + include_tags = true + require_full_window = true + notify_no_data = true + + evaluation_delay = "${var.evaluation_delay}" + + silenced = "${var.fetch_change_silenced}" + + tags = [ + "resource:elasticsearch", + "env:${var.environment}", + "created_by:terraform", + "${var.fetch_change_extra_tags}", + ] +} + +# +# Field Data Evictions +# +resource "datadog_monitor" "field_data_evictions_change" { + name = "[${var.environment}] Elasticsearch change alert on the total number of evictions from the fielddata cache" + message = "${coalesce(var.field_data_evictions_change_message, var.message)}" + + type = "query alert" + + // TODO add tags to filter by node type and do not apply this monitor on non-data nodes + query = < ${var.field_data_evictions_change_threshold_critical} +EOF + + thresholds { + warning = "${var.field_data_evictions_change_threshold_warning}" + critical = "${var.field_data_evictions_change_threshold_critical}" + } + + notify_audit = false + locked = false + include_tags = true + require_full_window = true + notify_no_data = true + + evaluation_delay = "${var.evaluation_delay}" + + silenced = "${var.field_data_evictions_change_silenced}" + + tags = [ + "resource:elasticsearch", + "env:${var.environment}", + "created_by:terraform", + "${var.field_data_evictions_change_extra_tags}", + ] +} + +# +# Query Cache Evictions +# +resource "datadog_monitor" "query_cache_evictions_change" { + name = "[${var.environment}] Elasticsearch change alert on the number of query cache evictions" + message = "${coalesce(var.query_cache_evictions_change_message, var.message)}" + + type = "query alert" + + // TODO add tags to filter by node type and do not apply this monitor on non-data nodes + query = < 
${var.query_cache_evictions_change_threshold_critical} +EOF + + thresholds { + warning = "${var.query_cache_evictions_change_threshold_warning}" + critical = "${var.query_cache_evictions_change_threshold_critical}" + } + + notify_audit = false + locked = false + include_tags = true + require_full_window = true + notify_no_data = true + + evaluation_delay = "${var.evaluation_delay}" + + silenced = "${var.query_cache_evictions_change_silenced}" + + tags = [ + "resource:elasticsearch", + "env:${var.environment}", + "created_by:terraform", + "${var.query_cache_evictions_change_extra_tags}", + ] +} + +# +# Request Cache Evictions +# +resource "datadog_monitor" "request_cache_evictions_change" { + name = "[${var.environment}] Elasticsearch change alert on the number of request cache evictions" + message = "${coalesce(var.request_cache_evictions_change_message, var.message)}" + + type = "query alert" + + // TODO add tags to filter by node type and do not apply this monitor on non-data nodes + query = < ${var.request_cache_evictions_change_threshold_critical} +EOF + + thresholds { + warning = "${var.request_cache_evictions_change_threshold_warning}" + critical = "${var.request_cache_evictions_change_threshold_critical}" + } + + notify_audit = false + locked = false + include_tags = true + require_full_window = true + notify_no_data = true + + evaluation_delay = "${var.evaluation_delay}" + + silenced = "${var.request_cache_evictions_change_silenced}" + + tags = [ + "resource:elasticsearch", + "env:${var.environment}", + "created_by:terraform", + "${var.request_cache_evictions_change_extra_tags}", + ] +} + +# +# Task Time in Queue +# +resource "datadog_monitor" "task_time_in_queue_change" { + name = "[${var.environment}] Elasticsearch change alert on the average time spent by tasks in the queue" + message = "${coalesce(var.task_time_in_queue_change_message, var.message)}" + + type = "query alert" + + query = < ${var.task_time_in_queue_change_threshold_critical} +EOF + + 
thresholds { + warning = "${var.task_time_in_queue_change_threshold_warning}" + critical = "${var.task_time_in_queue_change_threshold_critical}" + } + + notify_audit = false + locked = false + include_tags = true + require_full_window = true + notify_no_data = true + + evaluation_delay = "${var.evaluation_delay}" + + silenced = "${var.task_time_in_queue_change_silenced}" + + tags = [ + "resource:elasticsearch", + "env:${var.environment}", + "created_by:terraform", + "${var.task_time_in_queue_change_extra_tags}", + ] +} + +# +# Service Check +# +resource "datadog_monitor" "not_responding" { + name = "[${var.environment}] ElasticSearch does not respond" + message = "${coalesce(var.not_responding_message, var.message)}" + + query = <