# Datadog monitors for Elasticsearch.
#
# Every monitor is created only when its "<name>_enabled" variable equals the
# string "true", takes its notification text from "<name>_message" with
# var.message as the fallback, and appends "<name>_extra_tags" to a fixed tag set.
#
# NOTE(review): in the text this file was restored from, every heredoc opener
# appeared as a bare "<" (or "<=") with "EOQ" on the same line, and the metric
# expression in front of the threshold was absent. The "<<EOQ ... EOQ"
# delimiters are restored below; the comparator is presumed to be ">" (">="
# where the damaged text showed "<=") -- TODO confirm, and restore the metric
# expression of every query from version control before applying.

#
# Service Check
#
# NOTE(review): this resource looks like two monitors fused together: the
# header is the "does not respond" check, but the query threshold, thresholds
# block, silenced and extra tags all reference cluster_initializing_shards_*
# variables. No "type" attribute is visible either. Whatever sat between the
# two halves is not visible here -- TODO restore both monitors from version
# control.
resource "datadog_monitor" "not_responding" {
  count   = "${var.not_responding_enabled == "true" ? 1 : 0}"
  name    = "[${var.environment}] ElasticSearch does not respond"
  message = "${coalesce(var.not_responding_message, var.message)}"

  query = <<EOQ
    > ${var.cluster_initializing_shards_threshold_critical}
EOQ

  thresholds {
    warning  = "${var.cluster_initializing_shards_threshold_warning}"
    critical = "${var.cluster_initializing_shards_threshold_critical}"
  }

  notify_audit        = false
  locked              = false
  include_tags        = true
  require_full_window = true
  notify_no_data      = false
  evaluation_delay    = "${var.evaluation_delay}"

  silenced = "${var.cluster_initializing_shards_silenced}"

  tags = [
    "resource:elasticsearch",
    "env:${var.environment}",
    "created-by:terraform",
    "team:claranet",
    "type:database",
    "provider:elasticsearch",
    "${var.cluster_initializing_shards_extra_tags}",
  ]
}

#
# Cluster Relocating Shards
#
resource "datadog_monitor" "cluster_relocating_shards" {
  count   = "${var.cluster_relocating_shards_enabled == "true" ? 1 : 0}"
  name    = "[${var.environment}] ElasticSearch Cluster is relocating shards"
  message = "${coalesce(var.cluster_relocating_shards_message, var.message)}"
  type    = "metric alert"

  # NOTE(review): metric expression missing, comparator presumed -- TODO restore.
  query = <<EOQ
    > ${var.cluster_relocating_shards_threshold_critical}
EOQ

  thresholds {
    warning  = "${var.cluster_relocating_shards_threshold_warning}"
    critical = "${var.cluster_relocating_shards_threshold_critical}"
  }

  notify_audit        = false
  locked              = false
  include_tags        = true
  require_full_window = true
  notify_no_data      = false
  evaluation_delay    = "${var.evaluation_delay}"

  silenced = "${var.cluster_relocating_shards_silenced}"

  tags = [
    "resource:elasticsearch",
    "env:${var.environment}",
    "created-by:terraform",
    "team:claranet",
    "type:database",
    "provider:elasticsearch",
    "${var.cluster_relocating_shards_extra_tags}",
  ]
}

#
# Cluster Unassigned Shards
#
resource "datadog_monitor" "cluster_unassigned_shards" {
  count   = "${var.cluster_unassigned_shards_enabled == "true" ? 1 : 0}"
  name    = "[${var.environment}] ElasticSearch Cluster has unassigned shards"
  message = "${coalesce(var.cluster_unassigned_shards_message, var.message)}"
  type    = "metric alert"

  # NOTE(review): metric expression missing, comparator presumed -- TODO restore.
  query = <<EOQ
    > ${var.cluster_unassigned_shards_threshold_critical}
EOQ

  thresholds {
    warning  = "${var.cluster_unassigned_shards_threshold_warning}"
    critical = "${var.cluster_unassigned_shards_threshold_critical}"
  }

  notify_audit        = false
  locked              = false
  include_tags        = true
  require_full_window = true
  notify_no_data      = false
  evaluation_delay    = "${var.evaluation_delay}"

  silenced = "${var.cluster_unassigned_shards_silenced}"

  tags = [
    "resource:elasticsearch",
    "env:${var.environment}",
    "created-by:terraform",
    "team:claranet",
    "type:database",
    "provider:elasticsearch",
    "${var.cluster_unassigned_shards_extra_tags}",
  ]
}

#
# Free Space in nodes
#
# NOTE(review): this resource also looks like two monitors fused together: the
# header is the free-space monitor, but the query threshold, thresholds block,
# silenced and extra tags all reference jvm_heap_memory_usage_* variables.
# Whatever sat between the two halves is not visible here -- TODO restore both
# monitors from version control.
resource "datadog_monitor" "node_free_space" {
  count   = "${var.node_free_space_enabled == "true" ? 1 : 0}"
  name    = "[${var.environment}] ElasticSearch free space < 10%"
  message = "${coalesce(var.node_free_space_message, var.message)}"
  type    = "query alert"

  query = <<EOQ
    > ${var.jvm_heap_memory_usage_threshold_critical}
EOQ

  thresholds {
    warning  = "${var.jvm_heap_memory_usage_threshold_warning}"
    critical = "${var.jvm_heap_memory_usage_threshold_critical}"
  }

  notify_audit        = false
  locked              = false
  include_tags        = true
  require_full_window = true
  notify_no_data      = false
  evaluation_delay    = "${var.evaluation_delay}"

  silenced = "${var.jvm_heap_memory_usage_silenced}"

  tags = [
    "resource:elasticsearch",
    "env:${var.environment}",
    "created-by:terraform",
    "team:claranet",
    "type:database",
    "provider:elasticsearch",
    "${var.jvm_heap_memory_usage_extra_tags}",
  ]
}

#
# JVM Memory Young Usage
#
resource "datadog_monitor" "jvm_memory_young_usage" {
  count   = "${var.jvm_memory_young_usage_enabled == "true" ? 1 : 0}"
  name    = "[${var.environment}] Elasticsearch JVM memory Young usage {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
  message = "${coalesce(var.jvm_memory_young_usage_message, var.message)}"
  type    = "query alert"

  # NOTE(review): metric expression missing, comparator presumed -- TODO restore.
  query = <<EOQ
    > ${var.jvm_memory_young_usage_threshold_critical}
EOQ

  thresholds {
    warning  = "${var.jvm_memory_young_usage_threshold_warning}"
    critical = "${var.jvm_memory_young_usage_threshold_critical}"
  }

  notify_audit        = false
  locked              = false
  include_tags        = true
  require_full_window = true
  notify_no_data      = false
  evaluation_delay    = "${var.evaluation_delay}"

  silenced = "${var.jvm_memory_young_usage_silenced}"

  tags = [
    "resource:elasticsearch",
    "env:${var.environment}",
    "created-by:terraform",
    "team:claranet",
    "type:database",
    "provider:elasticsearch",
    "${var.jvm_memory_young_usage_extra_tags}",
  ]
}

#
# JVM Memory Old Usage
#
resource "datadog_monitor" "jvm_memory_old_usage" {
  count   = "${var.jvm_memory_old_usage_enabled == "true" ? 1 : 0}"
  name    = "[${var.environment}] Elasticsearch JVM memory Old usage {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
  message = "${coalesce(var.jvm_memory_old_usage_message, var.message)}"
  type    = "query alert"

  # NOTE(review): metric expression missing, comparator presumed -- TODO restore.
  query = <<EOQ
    > ${var.jvm_memory_old_usage_threshold_critical}
EOQ

  thresholds {
    warning  = "${var.jvm_memory_old_usage_threshold_warning}"
    critical = "${var.jvm_memory_old_usage_threshold_critical}"
  }

  notify_audit        = false
  locked              = false
  include_tags        = true
  require_full_window = true
  notify_no_data      = false
  evaluation_delay    = "${var.evaluation_delay}"

  silenced = "${var.jvm_memory_old_usage_silenced}"

  tags = [
    "resource:elasticsearch",
    "env:${var.environment}",
    "created-by:terraform",
    "team:claranet",
    "type:database",
    "provider:elasticsearch",
    "${var.jvm_memory_old_usage_extra_tags}",
  ]
}

#
# JVM Garbage Collector Old Collection Latency
#
resource "datadog_monitor" "jvm_gc_old_collection_latency" {
  count   = "${var.jvm_gc_old_collection_latency_enabled == "true" ? 1 : 0}"
  name    = "[${var.environment}] Elasticsearch average Old-generation garbage collections latency {{#is_alert}}{{{comparator}}} {{threshold}}ms ({{value}}ms){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}ms ({{value}}ms){{/is_warning}}"
  message = "${coalesce(var.jvm_gc_old_collection_latency_message, var.message)}"
  type    = "query alert"

  # NOTE(review): metric expression missing, comparator presumed -- TODO restore.
  query = <<EOQ
    > ${var.jvm_gc_old_collection_latency_threshold_critical}
EOQ

  thresholds {
    warning  = "${var.jvm_gc_old_collection_latency_threshold_warning}"
    critical = "${var.jvm_gc_old_collection_latency_threshold_critical}"
  }

  notify_audit        = false
  locked              = false
  include_tags        = true
  require_full_window = true
  notify_no_data      = false
  evaluation_delay    = "${var.evaluation_delay}"

  silenced = "${var.jvm_gc_old_collection_latency_silenced}"

  tags = [
    "resource:elasticsearch",
    "env:${var.environment}",
    "created-by:terraform",
    "team:claranet",
    "type:database",
    "provider:elasticsearch",
    "${var.jvm_gc_old_collection_latency_extra_tags}",
  ]
}

#
# JVM Garbage Collector Young Collection Latency
#
resource "datadog_monitor" "jvm_gc_young_collection_latency" {
  count   = "${var.jvm_gc_young_collection_latency_enabled == "true" ? 1 : 0}"
  name    = "[${var.environment}] Elasticsearch average Young-generation garbage collections latency {{#is_alert}}{{{comparator}}} {{threshold}}ms ({{value}}ms){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}ms ({{value}}ms){{/is_warning}}"
  message = "${coalesce(var.jvm_gc_young_collection_latency_message, var.message)}"
  type    = "query alert"

  # NOTE(review): metric expression missing, comparator presumed -- TODO restore.
  query = <<EOQ
    > ${var.jvm_gc_young_collection_latency_threshold_critical}
EOQ

  thresholds {
    warning  = "${var.jvm_gc_young_collection_latency_threshold_warning}"
    critical = "${var.jvm_gc_young_collection_latency_threshold_critical}"
  }

  notify_audit        = false
  locked              = false
  include_tags        = true
  require_full_window = true
  notify_no_data      = false
  evaluation_delay    = "${var.evaluation_delay}"

  silenced = "${var.jvm_gc_young_collection_latency_silenced}"

  tags = [
    "resource:elasticsearch",
    "env:${var.environment}",
    "created-by:terraform",
    "team:claranet",
    "type:database",
    "provider:elasticsearch",
    "${var.jvm_gc_young_collection_latency_extra_tags}",
  ]
}

#
# Indexing Latency
#
resource "datadog_monitor" "indexing_latency" {
  count   = "${var.indexing_latency_enabled == "true" ? 1 : 0}"
  name    = "[${var.environment}] Elasticsearch average indexing time by document {{#is_alert}}{{{comparator}}} {{threshold}}ms ({{value}}ms){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}ms ({{value}}ms){{/is_warning}}"
  message = "${coalesce(var.indexing_latency_message, var.message)}"
  type    = "query alert"

  # TODO add tags to filter by node type and do not apply this monitor on non-data nodes
  # NOTE(review): metric expression missing, comparator presumed -- TODO restore.
  query = <<EOQ
    > ${var.indexing_latency_threshold_critical}
EOQ

  thresholds {
    warning  = "${var.indexing_latency_threshold_warning}"
    critical = "${var.indexing_latency_threshold_critical}"
  }

  notify_audit        = false
  locked              = false
  include_tags        = true
  require_full_window = true
  notify_no_data      = false
  evaluation_delay    = "${var.evaluation_delay}"

  silenced = "${var.indexing_latency_silenced}"

  tags = [
    "resource:elasticsearch",
    "env:${var.environment}",
    "created-by:terraform",
    "team:claranet",
    "type:database",
    "provider:elasticsearch",
    "${var.indexing_latency_extra_tags}",
  ]
}

#
# Flush Latency
#
resource "datadog_monitor" "flush_latency" {
  count   = "${var.flush_latency_enabled == "true" ? 1 : 0}"
  name    = "[${var.environment}] Elasticsearch average index flushing to disk latency {{#is_alert}}{{{comparator}}} {{threshold}}ms ({{value}}ms){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}ms ({{value}}ms){{/is_warning}}"
  message = "${coalesce(var.flush_latency_message, var.message)}"
  type    = "query alert"

  # TODO add tags to filter by node type and do not apply this monitor on non-data nodes
  # NOTE(review): metric expression missing, comparator presumed -- TODO restore.
  query = <<EOQ
    > ${var.flush_latency_threshold_critical}
EOQ

  thresholds {
    warning  = "${var.flush_latency_threshold_warning}"
    critical = "${var.flush_latency_threshold_critical}"
  }

  notify_audit        = false
  locked              = false
  include_tags        = true
  require_full_window = true
  notify_no_data      = false
  evaluation_delay    = "${var.evaluation_delay}"

  silenced = "${var.flush_latency_silenced}"

  tags = [
    "resource:elasticsearch",
    "env:${var.environment}",
    "created-by:terraform",
    "team:claranet",
    "type:database",
    "provider:elasticsearch",
    "${var.flush_latency_extra_tags}",
  ]
}

#
# Open HTTP Connections Anomaly
#
resource "datadog_monitor" "http_connections_anomaly" {
  count   = "${var.http_connections_anomaly_enabled == "true" ? 1 : 0}"
  name    = "[${var.environment}] Elasticsearch number of current open HTTP connections anomaly detected"
  message = "${coalesce(var.http_connections_anomaly_message, var.message)}"
  type    = "query alert"

  # NOTE(review): metric expression missing, comparator presumed -- TODO restore.
  query = <<EOQ
    >= ${var.http_connections_anomaly_threshold_critical}
EOQ

  thresholds {
    warning  = "${var.http_connections_anomaly_threshold_warning}"
    critical = "${var.http_connections_anomaly_threshold_critical}"
  }

  notify_audit        = false
  locked              = false
  include_tags        = true
  require_full_window = true
  notify_no_data      = false
  evaluation_delay    = "${var.evaluation_delay}"

  silenced = "${var.http_connections_anomaly_silenced}"

  tags = [
    "resource:elasticsearch",
    "env:${var.environment}",
    "created-by:terraform",
    "team:claranet",
    "type:database",
    "provider:elasticsearch",
    "${var.http_connections_anomaly_extra_tags}",
  ]
}

#
# Query Latency
#
resource "datadog_monitor" "search_query_latency" {
  count   = "${var.search_query_latency_enabled == "true" ? 1 : 0}"
  name    = "[${var.environment}] Elasticsearch average search query latency {{#is_alert}}{{{comparator}}} {{threshold}}ms ({{value}}ms){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}ms ({{value}}ms){{/is_warning}}"
  message = "${coalesce(var.search_query_latency_message, var.message)}"
  type    = "query alert"

  # TODO add tags to filter by node type and do not apply this monitor on non-data nodes
  # NOTE(review): metric expression missing, comparator presumed -- TODO restore.
  query = <<EOQ
    > ${var.search_query_latency_threshold_critical}
EOQ

  thresholds {
    warning  = "${var.search_query_latency_threshold_warning}"
    critical = "${var.search_query_latency_threshold_critical}"
  }

  notify_audit        = false
  locked              = false
  include_tags        = true
  require_full_window = true
  notify_no_data      = false
  evaluation_delay    = "${var.evaluation_delay}"

  silenced = "${var.search_query_latency_silenced}"

  tags = [
    "resource:elasticsearch",
    "env:${var.environment}",
    "created-by:terraform",
    "team:claranet",
    "type:database",
    "provider:elasticsearch",
    "${var.search_query_latency_extra_tags}",
  ]
}

#
# Fetch Latency
#
resource "datadog_monitor" "fetch_latency" {
  count   = "${var.fetch_latency_enabled == "true" ? 1 : 0}"
  name    = "[${var.environment}] Elasticsearch average search fetch latency {{#is_alert}}{{{comparator}}} {{threshold}}ms ({{value}}ms){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}ms ({{value}}ms){{/is_warning}}"
  message = "${coalesce(var.fetch_latency_message, var.message)}"
  type    = "query alert"

  # TODO add tags to filter by node type and do not apply this monitor on non-data nodes
  # NOTE(review): metric expression missing, comparator presumed -- TODO restore.
  query = <<EOQ
    > ${var.fetch_latency_threshold_critical}
EOQ

  thresholds {
    warning  = "${var.fetch_latency_threshold_warning}"
    critical = "${var.fetch_latency_threshold_critical}"
  }

  notify_audit        = false
  locked              = false
  include_tags        = true
  require_full_window = true
  notify_no_data      = false
  evaluation_delay    = "${var.evaluation_delay}"

  silenced = "${var.fetch_latency_silenced}"

  tags = [
    "resource:elasticsearch",
    "env:${var.environment}",
    "created-by:terraform",
    "team:claranet",
    "type:database",
    "provider:elasticsearch",
    "${var.fetch_latency_extra_tags}",
  ]
}

#
# Search Query Change
#
resource "datadog_monitor" "search_query_change" {
  count   = "${var.search_query_change_enabled == "true" ? 1 : 0}"
  name    = "[${var.environment}] Elasticsearch change alert on the number of currently active queries"
  message = "${coalesce(var.search_query_change_message, var.message)}"
  type    = "query alert"

  # NOTE(review): metric expression missing, comparator presumed -- TODO restore.
  query = <<EOQ
    >= ${var.search_query_change_threshold_critical}
EOQ

  thresholds {
    warning  = "${var.search_query_change_threshold_warning}"
    critical = "${var.search_query_change_threshold_critical}"
  }

  notify_audit        = false
  locked              = false
  include_tags        = true
  require_full_window = true
  notify_no_data      = false
  evaluation_delay    = "${var.evaluation_delay}"

  silenced = "${var.search_query_change_silenced}"

  tags = [
    "resource:elasticsearch",
    "env:${var.environment}",
    "created-by:terraform",
    "team:claranet",
    "type:database",
    "provider:elasticsearch",
    "${var.search_query_change_extra_tags}",
  ]
}

#
# Fetch Change
#
resource "datadog_monitor" "fetch_change" {
  count   = "${var.fetch_change_enabled == "true" ? 1 : 0}"
  name    = "[${var.environment}] Elasticsearch change alert on the number of search fetches currently running"
  message = "${coalesce(var.fetch_change_message, var.message)}"
  type    = "query alert"

  # NOTE(review): metric expression missing, comparator presumed -- TODO restore.
  query = <<EOQ
    >= ${var.fetch_change_threshold_critical}
EOQ

  thresholds {
    warning  = "${var.fetch_change_threshold_warning}"
    critical = "${var.fetch_change_threshold_critical}"
  }

  notify_audit        = false
  locked              = false
  include_tags        = true
  require_full_window = true
  notify_no_data      = false
  evaluation_delay    = "${var.evaluation_delay}"

  silenced = "${var.fetch_change_silenced}"

  tags = [
    "resource:elasticsearch",
    "env:${var.environment}",
    "created-by:terraform",
    "team:claranet",
    "type:database",
    "provider:elasticsearch",
    "${var.fetch_change_extra_tags}",
  ]
}

#
# Field Data Evictions
#
resource "datadog_monitor" "field_data_evictions_change" {
  count   = "${var.field_data_evictions_change_enabled == "true" ? 1 : 0}"
  name    = "[${var.environment}] Elasticsearch change alert on the total number of evictions from the fielddata cache"
  message = "${coalesce(var.field_data_evictions_change_message, var.message)}"
  type    = "query alert"

  # TODO add tags to filter by node type and do not apply this monitor on non-data nodes
  # NOTE(review): metric expression missing, comparator presumed -- TODO restore.
  query = <<EOQ
    > ${var.field_data_evictions_change_threshold_critical}
EOQ

  thresholds {
    warning  = "${var.field_data_evictions_change_threshold_warning}"
    critical = "${var.field_data_evictions_change_threshold_critical}"
  }

  notify_audit        = false
  locked              = false
  include_tags        = true
  require_full_window = true
  notify_no_data      = false
  evaluation_delay    = "${var.evaluation_delay}"

  silenced = "${var.field_data_evictions_change_silenced}"

  tags = [
    "resource:elasticsearch",
    "env:${var.environment}",
    "created-by:terraform",
    "team:claranet",
    "type:database",
    "provider:elasticsearch",
    "${var.field_data_evictions_change_extra_tags}",
  ]
}

#
# Query Cache Evictions
#
resource "datadog_monitor" "query_cache_evictions_change" {
  count   = "${var.query_cache_evictions_change_enabled == "true" ? 1 : 0}"
  name    = "[${var.environment}] Elasticsearch change alert on the number of query cache evictions"
  message = "${coalesce(var.query_cache_evictions_change_message, var.message)}"
  type    = "query alert"

  # TODO add tags to filter by node type and do not apply this monitor on non-data nodes
  # NOTE(review): metric expression missing, comparator presumed -- TODO restore.
  query = <<EOQ
    > ${var.query_cache_evictions_change_threshold_critical}
EOQ

  thresholds {
    warning  = "${var.query_cache_evictions_change_threshold_warning}"
    critical = "${var.query_cache_evictions_change_threshold_critical}"
  }

  notify_audit        = false
  locked              = false
  include_tags        = true
  require_full_window = true
  notify_no_data      = false
  evaluation_delay    = "${var.evaluation_delay}"

  silenced = "${var.query_cache_evictions_change_silenced}"

  tags = [
    "resource:elasticsearch",
    "env:${var.environment}",
    "created-by:terraform",
    "team:claranet",
    "type:database",
    "provider:elasticsearch",
    "${var.query_cache_evictions_change_extra_tags}",
  ]
}

#
# Request Cache Evictions
#
resource "datadog_monitor" "request_cache_evictions_change" {
  count   = "${var.request_cache_evictions_change_enabled == "true" ? 1 : 0}"
  name    = "[${var.environment}] Elasticsearch change alert on the number of request cache evictions"
  message = "${coalesce(var.request_cache_evictions_change_message, var.message)}"
  type    = "query alert"

  # TODO add tags to filter by node type and do not apply this monitor on non-data nodes
  # NOTE(review): metric expression missing, comparator presumed -- TODO restore.
  query = <<EOQ
    > ${var.request_cache_evictions_change_threshold_critical}
EOQ

  thresholds {
    warning  = "${var.request_cache_evictions_change_threshold_warning}"
    critical = "${var.request_cache_evictions_change_threshold_critical}"
  }

  notify_audit        = false
  locked              = false
  include_tags        = true
  require_full_window = true
  notify_no_data      = false
  evaluation_delay    = "${var.evaluation_delay}"

  silenced = "${var.request_cache_evictions_change_silenced}"

  tags = [
    "resource:elasticsearch",
    "env:${var.environment}",
    "created-by:terraform",
    "team:claranet",
    "type:database",
    "provider:elasticsearch",
    "${var.request_cache_evictions_change_extra_tags}",
  ]
}

#
# Task Time in Queue
#
resource "datadog_monitor" "task_time_in_queue_change" {
  count   = "${var.task_time_in_queue_change_enabled == "true" ? 1 : 0}"
  name    = "[${var.environment}] Elasticsearch change alert on the average time spent by tasks in the queue"
  message = "${coalesce(var.task_time_in_queue_change_message, var.message)}"
  type    = "query alert"

  # NOTE(review): metric expression missing, comparator presumed -- TODO restore.
  query = <<EOQ
    > ${var.task_time_in_queue_change_threshold_critical}
EOQ

  thresholds {
    warning  = "${var.task_time_in_queue_change_threshold_warning}"
    critical = "${var.task_time_in_queue_change_threshold_critical}"
  }

  notify_audit        = false
  locked              = false
  include_tags        = true
  require_full_window = true
  notify_no_data      = false
  evaluation_delay    = "${var.evaluation_delay}"

  silenced = "${var.task_time_in_queue_change_silenced}"

  tags = [
    "resource:elasticsearch",
    "env:${var.environment}",
    "created-by:terraform",
    "team:claranet",
    "type:database",
    "provider:elasticsearch",
    "${var.task_time_in_queue_change_extra_tags}",
  ]
}