# # Service Check # resource "datadog_monitor" "not_responding" { count = var.not_responding_enabled == "true" ? 1 : 0 name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] ElasticSearch does not respond" message = coalesce(var.not_responding_message, var.message) type = "service check" query = < ${var.cluster_initializing_shards_threshold_critical} EOQ thresholds = { warning = var.cluster_initializing_shards_threshold_warning critical = var.cluster_initializing_shards_threshold_critical } evaluation_delay = var.evaluation_delay new_host_delay = var.new_host_delay notify_audit = false locked = false include_tags = true require_full_window = true notify_no_data = false tags = concat(["env:${var.environment}", "type:database", "provider:elasticsearch", "resource:elasticsearch", "team:claranet", "created-by:terraform"], var.cluster_initializing_shards_extra_tags) lifecycle { ignore_changes = [silenced] } } # # Cluster Relocating Shards # resource "datadog_monitor" "cluster_relocating_shards" { count = var.cluster_relocating_shards_enabled == "true" ? 1 : 0 name = "${var.prefix_slug == "" ? 
"" : "[${var.prefix_slug}]"}[${var.environment}] ElasticSearch Cluster is relocating shards" message = coalesce(var.cluster_relocating_shards_message, var.message) type = "metric alert" query = < ${var.cluster_relocating_shards_threshold_critical} EOQ thresholds = { warning = var.cluster_relocating_shards_threshold_warning critical = var.cluster_relocating_shards_threshold_critical } evaluation_delay = var.evaluation_delay new_host_delay = var.new_host_delay notify_audit = false locked = false include_tags = true require_full_window = true notify_no_data = false tags = concat(["env:${var.environment}", "type:database", "provider:elasticsearch", "resource:elasticsearch", "team:claranet", "created-by:terraform"], var.cluster_relocating_shards_extra_tags) lifecycle { ignore_changes = [silenced] } } # # Cluster Unassigned Shards # resource "datadog_monitor" "cluster_unassigned_shards" { count = var.cluster_unassigned_shards_enabled == "true" ? 1 : 0 name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] ElasticSearch Cluster has unassigned shards" message = coalesce(var.cluster_unassigned_shards_message, var.message) type = "metric alert" query = < ${var.cluster_unassigned_shards_threshold_critical} EOQ thresholds = { warning = var.cluster_unassigned_shards_threshold_warning critical = var.cluster_unassigned_shards_threshold_critical } evaluation_delay = var.evaluation_delay new_host_delay = var.new_host_delay notify_audit = false locked = false include_tags = true require_full_window = true notify_no_data = false tags = concat(["env:${var.environment}", "type:database", "provider:elasticsearch", "resource:elasticsearch", "team:claranet", "created-by:terraform"], var.cluster_unassigned_shards_extra_tags) lifecycle { ignore_changes = [silenced] } } # # Free Space in nodes # resource "datadog_monitor" "node_free_space" { count = var.node_free_space_enabled == "true" ? 1 : 0 name = "${var.prefix_slug == "" ? 
"" : "[${var.prefix_slug}]"}[${var.environment}] ElasticSearch free space < 10%" message = coalesce(var.node_free_space_message, var.message) type = "query alert" query = < ${var.jvm_heap_memory_usage_threshold_critical} EOQ thresholds = { warning = var.jvm_heap_memory_usage_threshold_warning critical = var.jvm_heap_memory_usage_threshold_critical } evaluation_delay = var.evaluation_delay new_host_delay = var.new_host_delay notify_audit = false locked = false include_tags = true require_full_window = true notify_no_data = false tags = concat(["env:${var.environment}", "type:database", "provider:elasticsearch", "resource:elasticsearch", "team:claranet", "created-by:terraform"], var.jvm_heap_memory_usage_extra_tags) lifecycle { ignore_changes = [silenced] } } # # JVM Memory Young Usage # resource "datadog_monitor" "jvm_memory_young_usage" { count = var.jvm_memory_young_usage_enabled == "true" ? 1 : 0 name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch JVM memory Young usage {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" message = coalesce(var.jvm_memory_young_usage_message, var.message) type = "query alert" query = < ${var.jvm_memory_young_usage_threshold_critical} EOQ thresholds = { warning = var.jvm_memory_young_usage_threshold_warning critical = var.jvm_memory_young_usage_threshold_critical } evaluation_delay = var.evaluation_delay new_host_delay = var.new_host_delay notify_audit = false locked = false include_tags = true require_full_window = true notify_no_data = false tags = concat(["env:${var.environment}", "type:database", "provider:elasticsearch", "resource:elasticsearch", "team:claranet", "created-by:terraform"], var.jvm_memory_young_usage_extra_tags) lifecycle { ignore_changes = [silenced] } } # # JVM Memory Old Usage # resource "datadog_monitor" "jvm_memory_old_usage" { count = 
var.jvm_memory_old_usage_enabled == "true" ? 1 : 0 name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch JVM memory Old usage {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" message = coalesce(var.jvm_memory_old_usage_message, var.message) type = "query alert" query = < ${var.jvm_memory_old_usage_threshold_critical} EOQ thresholds = { warning = var.jvm_memory_old_usage_threshold_warning critical = var.jvm_memory_old_usage_threshold_critical } evaluation_delay = var.evaluation_delay new_host_delay = var.new_host_delay notify_audit = false locked = false include_tags = true require_full_window = true notify_no_data = false tags = concat(["env:${var.environment}", "type:database", "provider:elasticsearch", "resource:elasticsearch", "team:claranet", "created-by:terraform"], var.jvm_memory_old_usage_extra_tags) lifecycle { ignore_changes = [silenced] } } # # JVM Garbage Collector Old Collection Latency # resource "datadog_monitor" "jvm_gc_old_collection_latency" { count = var.jvm_gc_old_collection_latency_enabled == "true" ? 1 : 0 name = "${var.prefix_slug == "" ? 
"" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch average Old-generation garbage collections latency {{#is_alert}}{{{comparator}}} {{threshold}}ms ({{value}}ms){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}ms ({{value}}ms){{/is_warning}}" message = coalesce(var.jvm_gc_old_collection_latency_message, var.message) type = "query alert" query = < ${var.jvm_gc_old_collection_latency_threshold_critical} EOQ thresholds = { warning = var.jvm_gc_old_collection_latency_threshold_warning critical = var.jvm_gc_old_collection_latency_threshold_critical } evaluation_delay = var.evaluation_delay new_host_delay = var.new_host_delay notify_audit = false locked = false include_tags = true require_full_window = true notify_no_data = false tags = concat(["env:${var.environment}", "type:database", "provider:elasticsearch", "resource:elasticsearch", "team:claranet", "created-by:terraform"], var.jvm_gc_old_collection_latency_extra_tags) lifecycle { ignore_changes = [silenced] } } # # JVM Garbage Collector Young Collection Latency # resource "datadog_monitor" "jvm_gc_young_collection_latency" { count = var.jvm_gc_young_collection_latency_enabled == "true" ? 1 : 0 name = "${var.prefix_slug == "" ? 
"" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch average Young-generation garbage collections latency {{#is_alert}}{{{comparator}}} {{threshold}}ms ({{value}}ms){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}ms ({{value}}ms){{/is_warning}}" message = coalesce(var.jvm_gc_young_collection_latency_message, var.message) type = "query alert" query = < ${var.jvm_gc_young_collection_latency_threshold_critical} EOQ thresholds = { warning = var.jvm_gc_young_collection_latency_threshold_warning critical = var.jvm_gc_young_collection_latency_threshold_critical } evaluation_delay = var.evaluation_delay new_host_delay = var.new_host_delay notify_audit = false locked = false include_tags = true require_full_window = true notify_no_data = false tags = concat(["env:${var.environment}", "type:database", "provider:elasticsearch", "resource:elasticsearch", "team:claranet", "created-by:terraform"], var.jvm_gc_young_collection_latency_extra_tags) lifecycle { ignore_changes = [silenced] } } # # Indexing Latency # resource "datadog_monitor" "indexing_latency" { count = var.indexing_latency_enabled == "true" ? 1 : 0 name = "${var.prefix_slug == "" ? 
"" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch average indexing latency by document {{#is_alert}}{{{comparator}}} {{threshold}}ms ({{value}}ms){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}ms ({{value}}ms){{/is_warning}}" message = coalesce(var.indexing_latency_message, var.message) type = "query alert" // TODO add tags to filter by node type and do not apply this monitor on non-data nodes query = < ${var.indexing_latency_threshold_critical} EOQ thresholds = { warning = var.indexing_latency_threshold_warning critical = var.indexing_latency_threshold_critical } evaluation_delay = var.evaluation_delay new_host_delay = var.new_host_delay notify_audit = false locked = false include_tags = true require_full_window = true notify_no_data = false tags = concat(["env:${var.environment}", "type:database", "provider:elasticsearch", "resource:elasticsearch", "team:claranet", "created-by:terraform"], var.indexing_latency_extra_tags) lifecycle { ignore_changes = [silenced] } } # # Flush Latency # resource "datadog_monitor" "flush_latency" { count = var.flush_latency_enabled == "true" ? 1 : 0 name = "${var.prefix_slug == "" ? 
"" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch average index flushing to disk latency {{#is_alert}}{{{comparator}}} {{threshold}}ms ({{value}}ms){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}ms ({{value}}ms){{/is_warning}}" message = coalesce(var.flush_latency_message, var.message) type = "query alert" // TODO add tags to filter by node type and do not apply this monitor on non-data nodes query = < ${var.flush_latency_threshold_critical} EOQ thresholds = { warning = var.flush_latency_threshold_warning critical = var.flush_latency_threshold_critical } evaluation_delay = var.evaluation_delay new_host_delay = var.new_host_delay notify_audit = false locked = false include_tags = true require_full_window = true notify_no_data = false tags = concat(["env:${var.environment}", "type:database", "provider:elasticsearch", "resource:elasticsearch", "team:claranet", "created-by:terraform"], var.flush_latency_extra_tags) lifecycle { ignore_changes = [silenced] } } # # Open HTTP Connections Anomaly # resource "datadog_monitor" "http_connections_anomaly" { count = var.http_connections_anomaly_enabled == "true" ? 1 : 0 name = "${var.prefix_slug == "" ? 
"" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch number of current open HTTP connections anomaly detected" message = coalesce(var.http_connections_anomaly_message, var.message) type = "query alert" query = <= ${var.http_connections_anomaly_threshold_critical} EOQ thresholds = { warning = var.http_connections_anomaly_threshold_warning critical = var.http_connections_anomaly_threshold_critical } threshold_windows = { trigger_window = var.http_connections_anomaly_alert_window recovery_window = var.http_connections_anomaly_alert_window } evaluation_delay = var.evaluation_delay new_host_delay = var.new_host_delay notify_audit = false locked = false include_tags = true require_full_window = true notify_no_data = false tags = concat(["env:${var.environment}", "type:database", "provider:elasticsearch", "resource:elasticsearch", "team:claranet", "created-by:terraform"], var.http_connections_anomaly_extra_tags) lifecycle { ignore_changes = [silenced] } } # # Query Latency # resource "datadog_monitor" "search_query_latency" { count = var.search_query_latency_enabled == "true" ? 1 : 0 name = "${var.prefix_slug == "" ? 
"" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch average search query latency {{#is_alert}}{{{comparator}}} {{threshold}}ms ({{value}}ms){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}ms ({{value}}ms){{/is_warning}}" message = coalesce(var.search_query_latency_message, var.message) type = "query alert" // TODO add tags to filter by node type and do not apply this monitor on non-data nodes query = < ${var.search_query_latency_threshold_critical} EOQ thresholds = { warning = var.search_query_latency_threshold_warning critical = var.search_query_latency_threshold_critical } evaluation_delay = var.evaluation_delay new_host_delay = var.new_host_delay notify_audit = false locked = false include_tags = true require_full_window = true notify_no_data = false tags = concat(["env:${var.environment}", "type:database", "provider:elasticsearch", "resource:elasticsearch", "team:claranet", "created-by:terraform"], var.search_query_latency_extra_tags) lifecycle { ignore_changes = [silenced] } } # # Fetch Latency # resource "datadog_monitor" "fetch_latency" { count = var.fetch_latency_enabled == "true" ? 1 : 0 name = "${var.prefix_slug == "" ? 
"" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch average search fetch latency {{#is_alert}}{{{comparator}}} {{threshold}}ms ({{value}}ms){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}ms ({{value}}ms){{/is_warning}}" message = coalesce(var.fetch_latency_message, var.message) type = "query alert" // TODO add tags to filter by node type and do not apply this monitor on non-data nodes query = < ${var.fetch_latency_threshold_critical} EOQ thresholds = { warning = var.fetch_latency_threshold_warning critical = var.fetch_latency_threshold_critical } evaluation_delay = var.evaluation_delay new_host_delay = var.new_host_delay notify_audit = false locked = false include_tags = true require_full_window = true notify_no_data = false tags = concat(["env:${var.environment}", "type:database", "provider:elasticsearch", "resource:elasticsearch", "team:claranet", "created-by:terraform"], var.fetch_latency_extra_tags) lifecycle { ignore_changes = [silenced] } } # # Search Query Change # resource "datadog_monitor" "search_query_change" { count = var.search_query_change_enabled == "true" ? 1 : 0 name = "${var.prefix_slug == "" ? 
"" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch change alert on the number of currently active queries" message = coalesce(var.search_query_change_message, var.message) type = "query alert" query = <= ${var.search_query_change_threshold_critical} EOQ thresholds = { warning = var.search_query_change_threshold_warning critical = var.search_query_change_threshold_critical } evaluation_delay = var.evaluation_delay new_host_delay = var.new_host_delay notify_audit = false locked = false include_tags = true require_full_window = true notify_no_data = false tags = concat(["env:${var.environment}", "type:database", "provider:elasticsearch", "resource:elasticsearch", "team:claranet", "created-by:terraform"], var.search_query_change_extra_tags) lifecycle { ignore_changes = [silenced] } } # # Fetch Change # resource "datadog_monitor" "fetch_change" { count = var.fetch_change_enabled == "true" ? 1 : 0 name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch change alert on the number of search fetches currently running" message = coalesce(var.fetch_change_message, var.message) type = "query alert" query = <= ${var.fetch_change_threshold_critical} EOQ thresholds = { warning = var.fetch_change_threshold_warning critical = var.fetch_change_threshold_critical } evaluation_delay = var.evaluation_delay new_host_delay = var.new_host_delay notify_audit = false locked = false include_tags = true require_full_window = true notify_no_data = false tags = concat(["env:${var.environment}", "type:database", "provider:elasticsearch", "resource:elasticsearch", "team:claranet", "created-by:terraform"], var.fetch_change_extra_tags) lifecycle { ignore_changes = [silenced] } } # # Field Data Evictions # resource "datadog_monitor" "field_data_evictions_change" { count = var.field_data_evictions_change_enabled == "true" ? 1 : 0 name = "${var.prefix_slug == "" ? 
"" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch change alert on the total number of evictions from the fielddata cache" message = coalesce(var.field_data_evictions_change_message, var.message) type = "query alert" // TODO add tags to filter by node type and do not apply this monitor on non-data nodes query = < ${var.field_data_evictions_change_threshold_critical} EOQ thresholds = { warning = var.field_data_evictions_change_threshold_warning critical = var.field_data_evictions_change_threshold_critical } evaluation_delay = var.evaluation_delay new_host_delay = var.new_host_delay notify_audit = false locked = false include_tags = true require_full_window = true notify_no_data = false tags = concat(["env:${var.environment}", "type:database", "provider:elasticsearch", "resource:elasticsearch", "team:claranet", "created-by:terraform"], var.field_data_evictions_change_extra_tags) lifecycle { ignore_changes = [silenced] } } # # Query Cache Evictions # resource "datadog_monitor" "query_cache_evictions_change" { count = var.query_cache_evictions_change_enabled == "true" ? 1 : 0 name = "${var.prefix_slug == "" ? 
"" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch change alert on the number of query cache evictions" message = coalesce(var.query_cache_evictions_change_message, var.message) type = "query alert" // TODO add tags to filter by node type and do not apply this monitor on non-data nodes query = < ${var.query_cache_evictions_change_threshold_critical} EOQ thresholds = { warning = var.query_cache_evictions_change_threshold_warning critical = var.query_cache_evictions_change_threshold_critical } evaluation_delay = var.evaluation_delay new_host_delay = var.new_host_delay notify_audit = false locked = false include_tags = true require_full_window = true notify_no_data = false tags = concat(["env:${var.environment}", "type:database", "provider:elasticsearch", "resource:elasticsearch", "team:claranet", "created-by:terraform"], var.query_cache_evictions_change_extra_tags) lifecycle { ignore_changes = [silenced] } } # # Request Cache Evictions # resource "datadog_monitor" "request_cache_evictions_change" { count = var.request_cache_evictions_change_enabled == "true" ? 1 : 0 name = "${var.prefix_slug == "" ? 
"" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch change alert on the number of request cache evictions" message = coalesce(var.request_cache_evictions_change_message, var.message) type = "query alert" // TODO add tags to filter by node type and do not apply this monitor on non-data nodes query = < ${var.request_cache_evictions_change_threshold_critical} EOQ thresholds = { warning = var.request_cache_evictions_change_threshold_warning critical = var.request_cache_evictions_change_threshold_critical } evaluation_delay = var.evaluation_delay new_host_delay = var.new_host_delay notify_audit = false locked = false include_tags = true require_full_window = true notify_no_data = false tags = concat(["env:${var.environment}", "type:database", "provider:elasticsearch", "resource:elasticsearch", "team:claranet", "created-by:terraform"], var.request_cache_evictions_change_extra_tags) lifecycle { ignore_changes = [silenced] } } # # Task Time in Queue # resource "datadog_monitor" "task_time_in_queue_change" { count = var.task_time_in_queue_change_enabled == "true" ? 1 : 0 name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Elasticsearch change alert on the average time spent by tasks in the queue" message = coalesce(var.task_time_in_queue_change_message, var.message) type = "query alert" query = < ${var.task_time_in_queue_change_threshold_critical} EOQ thresholds = { warning = var.task_time_in_queue_change_threshold_warning critical = var.task_time_in_queue_change_threshold_critical } evaluation_delay = var.evaluation_delay new_host_delay = var.new_host_delay notify_audit = false locked = false include_tags = true require_full_window = true notify_no_data = false tags = concat(["env:${var.environment}", "type:database", "provider:elasticsearch", "resource:elasticsearch", "team:claranet", "created-by:terraform"], var.task_time_in_queue_change_extra_tags) lifecycle { ignore_changes = [silenced] } }