MON-228 improve service check

This commit is contained in:
Quentin Manfroi 2018-09-11 15:56:40 +02:00
parent 391240f597
commit f0e252df33
4 changed files with 57 additions and 69 deletions

View File

@ -170,13 +170,11 @@ Creates DataDog monitors with the following checks:
| node_free_space_threshold_warning | Cluster Status warning threshold | string | `20` | no |
| node_free_space_time_aggregator | Time aggregator for the Cluster Status monitor | string | `sum` | no |
| node_free_space_timeframe | Timeframe for the Cluster Status monitor | string | `last_5m` | no |
| not_responding_by | Group by for the service check | string | `"port","server"` | no |
| not_responding_extra_tags | Extra tags for Elasticsearch does not respond monitor | list | `<list>` | no |
| not_responding_last | Parameter 'last' for the service check | string | `1` | no |
| not_responding_message | Custom message for Elasticsearch does not respond monitor | string | `` | no |
| not_responding_no_data_timeframe | Elasticsearch not responding monitor no data timeframe | string | `10` | no |
| not_responding_silenced | Groups to mute for Elasticsearch does not respond monitor | map | `<map>` | no |
| not_responding_threshold_critical | Not responding limit (critical threshold) | string | `5` | no |
| not_responding_threshold_warning | Not responding limit (warning threshold) | string | `0` | no |
| not_responding_threshold_warning | Elasticsearch not responding limit (warning threshold) | string | `3` | no |
| query_cache_evictions_change_extra_tags | Extra tags for Cluster Status monitor | list | `<list>` | no |
| query_cache_evictions_change_message | Custom message for the Cluster Status monitor | string | `` | no |
| query_cache_evictions_change_silenced | Groups to mute for Cluster Status monitor | map | `<map>` | no |

View File

@ -1068,26 +1068,15 @@ variable "not_responding_message" {
default = ""
}
# Comma-separated, individually quoted tag names; the value is interpolated
# verbatim inside the .by(...) clause of the service-check query.
variable "not_responding_by" {
  type        = "string"
  default     = "\"port\",\"server\""
  description = "Group by for the service check"
}
# Value interpolated into the .last(...) clause of the service-check query.
variable "not_responding_last" {
  type        = "string"
  default     = 1
  description = "Parameter 'last' for the service check"
}
# Critical threshold consumed by the "not responding" monitor's thresholds block.
variable "not_responding_threshold_critical" {
  default     = 5
  description = "Not responding limit (critical threshold)"
}
# Warning threshold consumed by the "not responding" monitor's thresholds block.
# NOTE(review): `description` and `default` each appear twice below (0 / 3).
# This looks like the old and new lines of a diff rendered without +/- markers;
# confirm which pair is current before relying on this block.
variable "not_responding_threshold_warning" {
description = "Not responding limit (warning threshold)"
default = 0
description = "Elasticsearch not responding limit (warning threshold)"
default = 3
}
# Passed to the "not responding" monitor as its no_data_timeframe argument.
variable "not_responding_no_data_timeframe" {
  type        = "string"
  default     = 10
  description = "Elasticsearch not responding monitor no data timeframe"
}
variable "not_responding_extra_tags" {

View File

@ -1,3 +1,45 @@
#
# Service Check
#
resource "datadog_monitor" "not_responding" {
  # Service-check monitor on "elasticsearch.can_connect", grouped by server and port.
  type = "service check"
  name = "[${var.environment}] ElasticSearch does not respond"

  # The monitor-specific message is listed first, the module-wide message second.
  message = "${coalesce(var.not_responding_message, var.message)}"

  # The heredoc body and its terminator stay at column 0 so the query text
  # sent to Datadog is unchanged.
  query = <<EOL
"elasticsearch.can_connect".over${module.filter-tags.service_check}.by("server","port").last(6).count_by_status()
EOL

  # Critical is fixed in the resource; only the warning level is configurable.
  thresholds {
    critical = 5
    warning  = "${var.not_responding_threshold_warning}"
  }

  # No-data handling.
  notify_no_data    = true
  no_data_timeframe = "${var.not_responding_no_data_timeframe}"

  # Evaluation and notification behaviour.
  new_host_delay      = "${var.new_host_delay}"
  require_full_window = true
  renotify_interval   = 0
  timeout_h           = 0
  include_tags        = true
  notify_audit        = false
  locked              = false

  silenced = "${var.not_responding_silenced}"

  # Fixed tags first, caller-supplied extra tags last.
  tags = [
    "created-by:terraform",
    "team:claranet",
    "type:databases",
    "provider:elasticsearch",
    "env:${var.environment}",
    "resource:elasticsearch",
    "${var.not_responding_extra_tags}",
  ]
}
#
# Cluster Status Not Green
#
@ -877,44 +919,3 @@ EOF
"${var.task_time_in_queue_change_extra_tags}",
]
}
#
# Service Check
#
resource "datadog_monitor" "not_responding" {
  # Service-check monitor on "elasticsearch.can_connect"; the grouping and the
  # .last(...) value both come from variables.
  type = "service check"
  name = "[${var.environment}] ElasticSearch does not respond"

  # The monitor-specific message is listed first, the module-wide message second.
  message = "${coalesce(var.not_responding_message, var.message)}"

  # The heredoc body and its terminator stay at column 0 so the query text
  # sent to Datadog is unchanged.
  query = <<EOL
"elasticsearch.can_connect".over${module.filter-tags.service_check}.by(${var.not_responding_by}).last(${var.not_responding_last}).pct_by_status()
EOL

  # Both alert levels are supplied by the caller.
  thresholds {
    critical = "${var.not_responding_threshold_critical}"
    warning  = "${var.not_responding_threshold_warning}"
  }

  # Evaluation and notification behaviour.
  notify_no_data      = true
  new_host_delay      = "${var.new_host_delay}"
  require_full_window = true
  renotify_interval   = 0
  timeout_h           = 0
  include_tags        = true
  notify_audit        = false
  locked              = false

  silenced = "${var.not_responding_silenced}"

  # Fixed tags first, caller-supplied extra tags last.
  tags = [
    "created-by:terraform",
    "team:claranet",
    "type:databases",
    "provider:elasticsearch",
    "env:${var.environment}",
    "resource:elasticsearch",
    "${var.not_responding_extra_tags}",
  ]
}

View File

@ -1,3 +1,8 @@
# Exposes the id attribute of datadog_monitor.not_responding via a splat expression.
output "not_responding_id" {
  value       = "${datadog_monitor.not_responding.*.id}"
  description = "id for monitor not_responding"
}
output "cluster_status_not_green_id" {
description = "id for monitor cluster_status_not_green"
value = "${datadog_monitor.cluster_status_not_green.*.id}"
@ -102,8 +107,3 @@ output "task_time_in_queue_change_id" {
description = "id for monitor task_time_in_queue_change"
value = "${datadog_monitor.task_time_in_queue_change.*.id}"
}
# Exposes the id attribute of datadog_monitor.not_responding via a splat expression.
output "not_responding_id" {
  value       = "${datadog_monitor.not_responding.*.id}"
  description = "id for monitor not_responding"
}