MON-191 - Generic system monitors updated with customizable aggregator

This commit is contained in:
Alexandre Gaillet 2018-04-30 15:50:58 +02:00 committed by Quentin Manfroi
parent 5bd3cab9ba
commit 4d1e840b7f
4 changed files with 53 additions and 18 deletions

View File

@ -11,7 +11,7 @@ resource "datadog_monitor" "status" {
message = "${coalesce(var.status_message, var.message)}"
query = <<EOF
avg(${var.status_timeframe}):avg:azure.cache_redis.status{${data.template_file.filter.rendered}} by {resource_group,region,name} != 1
${var.status_aggregator}(${var.status_timeframe}):avg:azure.cache_redis.status{${data.template_file.filter.rendered}} by {resource_group,region,name} != 1
EOF
type = "metric alert"
@ -36,8 +36,8 @@ resource "datadog_monitor" "evictedkeys" {
message = "${coalesce(var.evictedkeys_limit_message, var.message)}"
query = <<EOF
avg(${var.evictedkeys_limit_timeframe}): (
avg:azure.cache_redis.evictedkeys{${data.template_file.filter.rendered}} by {resource_group,region,name}
${var.evictedkeys_limit_aggregator}(${var.evictedkeys_limit_timeframe}): (
${var.evictedkeys_limit_aggregator}:azure.cache_redis.evictedkeys{${data.template_file.filter.rendered}} by {resource_group,region,name}
) > ${var.evictedkeys_limit_threshold_critical}
EOF
@ -68,8 +68,8 @@ resource "datadog_monitor" "percent_processor_time" {
message = "${coalesce(var.percent_processor_time_message, var.message)}"
query = <<EOF
avg(${var.percent_processor_time_timeframe}): (
avg:azure.cache_redis.percent_processor_time{${data.template_file.filter.rendered}} by {resource_group,region,name}
${var.percent_processor_time_aggregator}(${var.percent_processor_time_timeframe}): (
${var.percent_processor_time_aggregator}:azure.cache_redis.percent_processor_time{${data.template_file.filter.rendered}} by {resource_group,region,name}
) > ${var.percent_processor_time_threshold_critical}
EOF

View File

@ -28,11 +28,13 @@ Inputs
| Name | Description | Type | Default | Required |
|------|-------------|:----:|:-----:|:-----:|
| cpu_high_aggregator | Monitor aggregator for CPU high [available values: min, max, sum or avg] | string | `min` | no |
| cpu_high_message | Custom message for CPU high monitor | string | `` | no |
| cpu_high_silenced | Groups to mute for CPU high monitor | map | `<map>` | no |
| cpu_high_threshold_critical | CPU high critical threshold | string | `95` | no |
| cpu_high_threshold_warning | CPU high warning threshold | string | `80` | no |
| cpu_high_timeframe | Monitor timeframe for CPU high [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| cpu_load_aggregator | Monitor aggregator for CPU load ratio [available values: min, max, sum or avg] | string | `min` | no |
| cpu_load_message | Custom message for CPU load ratio monitor | string | `` | no |
| cpu_load_silenced | Groups to mute for CPU load ratio monitor | map | `<map>` | no |
| cpu_load_threshold_critical | CPU load ratio critical threshold | string | `4` | no |
@ -42,16 +44,19 @@ Inputs
| environment | Architecture Environment | string | - | yes |
| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no |
| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no |
| free_disk_inodes_aggregator | Monitor aggregator for Free disk inodes [available values: min, max, sum or avg] | string | `min` | no |
| free_disk_inodes_message | Custom message for Free disk inodes monitor | string | `` | no |
| free_disk_inodes_silenced | Groups to mute for Free disk inodes monitor | map | `<map>` | no |
| free_disk_inodes_threshold_critical | Free disk inodes critical threshold | string | `5` | no |
| free_disk_inodes_threshold_warning | Free disk inodes warning threshold | string | `10` | no |
| free_disk_inodes_timeframe | Monitor timeframe for Free disk inodes [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| free_disk_space_aggregator | Monitor aggregator for Free diskspace [available values: min, max, sum or avg] | string | `min` | no |
| free_disk_space_message | Custom message for Free diskspace monitor | string | `` | no |
| free_disk_space_silenced | Groups to mute for Free diskspace monitor | map | `<map>` | no |
| free_disk_space_threshold_critical | Free disk space critical threshold | string | `5` | no |
| free_disk_space_threshold_warning | Free disk space warning threshold | string | `10` | no |
| free_disk_space_timeframe | Monitor timeframe for Free diskspace [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `last_5m` | no |
| free_memory_aggregator | Monitor aggregator for Free memory [available values: min, max, sum or avg] | string | `min` | no |
| free_memory_message | Custom message for Free memory monitor | string | - | yes |
| free_memory_silenced | Groups to mute for Free memory monitor | map | `<map>` | no |
| free_memory_threshold_critical | Free memory critical threshold | string | `5` | no |

View File

@ -38,6 +38,12 @@ variable "cpu_high_message" {
default = ""
}
# Aggregation function applied over cpu_high_timeframe in the CPU high
# monitor query (the outer "<aggregator>(<timeframe>): (...)" wrapper).
# The inner system.cpu.idle metric in that query keeps its own "avg:" prefix.
variable "cpu_high_aggregator" {
  type        = "string"
  default     = "min"
  description = "Monitor aggregator for CPU high [available values: min, max, sum or avg]"
}
variable "cpu_high_timeframe" {
description = "Monitor timeframe for CPU high [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"
@ -66,6 +72,12 @@ variable "cpu_load_message" {
default = ""
}
# Aggregation function for the CPU load ratio monitor query. The query
# interpolates it twice: as the outer function over cpu_load_timeframe and
# as the prefix of both system.load.5 and system.core.count.
variable "cpu_load_aggregator" {
  type        = "string"
  default     = "min"
  description = "Monitor aggregator for CPU load ratio [available values: min, max, sum or avg]"
}
variable "cpu_load_timeframe" {
description = "Monitor timeframe for CPU load ratio [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"
@ -94,6 +106,12 @@ variable "free_disk_space_message" {
default = ""
}
# Aggregation function for the free disk space monitor query. The query
# interpolates it twice: as the outer function over free_disk_space_timeframe
# and as the prefix of both system.disk.free and system.disk.total.
variable "free_disk_space_aggregator" {
  type        = "string"
  default     = "min"
  description = "Monitor aggregator for Free diskspace [available values: min, max, sum or avg]"
}
variable "free_disk_space_timeframe" {
description = "Monitor timeframe for Free diskspace [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"
@ -122,6 +140,12 @@ variable "free_disk_inodes_message" {
default = ""
}
# Aggregation function for the free disk inodes monitor query. The query
# interpolates it twice: as the outer function over free_disk_inodes_timeframe
# and as the prefix of both system.fs.inodes.free and system.fs.inodes.total.
variable "free_disk_inodes_aggregator" {
  type        = "string"
  default     = "min"
  description = "Monitor aggregator for Free disk inodes [available values: min, max, sum or avg]"
}
variable "free_disk_inodes_timeframe" {
description = "Monitor timeframe for Free disk inodes [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"
@ -149,6 +173,12 @@ variable "free_memory_message" {
type = "string"
}
# Aggregation function for the free memory monitor query. The query
# interpolates it twice: as the outer function over free_memory_timeframe
# and as the prefix of both system.mem.free and system.mem.total.
variable "free_memory_aggregator" {
  type        = "string"
  default     = "min"
  description = "Monitor aggregator for Free memory [available values: min, max, sum or avg]"
}
variable "free_memory_timeframe" {
description = "Monitor timeframe for Free memory [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = "string"

View File

@ -11,7 +11,7 @@ resource "datadog_monitor" "datadog_cpu_too_high" {
message = "${coalesce(var.cpu_high_message, var.message)}"
query = <<EOF
min(${var.cpu_high_timeframe}): (
${var.cpu_high_aggregator}(${var.cpu_high_timeframe}): (
100 - avg:system.cpu.idle{${data.template_file.filter.rendered}} by {region,host}
) > ${var.cpu_high_threshold_critical}
EOF
@ -42,9 +42,9 @@ resource "datadog_monitor" "datadog_load_too_high" {
message = "${coalesce(var.cpu_load_message, var.message)}"
query = <<EOF
min(${var.cpu_load_timeframe}): (
avg:system.load.5{${data.template_file.filter.rendered}} by {region,host} /
avg:system.core.count{${data.template_file.filter.rendered}} by {region,host}
${var.cpu_load_aggregator}(${var.cpu_load_timeframe}): (
${var.cpu_load_aggregator}:system.load.5{${data.template_file.filter.rendered}} by {region,host} /
${var.cpu_load_aggregator}:system.core.count{${data.template_file.filter.rendered}} by {region,host}
) > ${var.cpu_load_threshold_critical}
EOF
@ -74,9 +74,9 @@ resource "datadog_monitor" "datadog_free_disk_space_too_low" {
message = "${coalesce(var.free_disk_space_message, var.message)}"
query = <<EOF
min(${var.free_disk_space_timeframe}): (
avg:system.disk.free{${data.template_file.filter.rendered},dd_disk:enabled} by {region,host,device} /
avg:system.disk.total{${data.template_file.filter.rendered},dd_disk:enabled} by {region,host,device} * 100
${var.free_disk_space_aggregator}(${var.free_disk_space_timeframe}): (
${var.free_disk_space_aggregator}:system.disk.free{${data.template_file.filter.rendered},dd_disk:enabled} by {region,host,device} /
${var.free_disk_space_aggregator}:system.disk.total{${data.template_file.filter.rendered},dd_disk:enabled} by {region,host,device} * 100
) < ${var.free_disk_space_threshold_critical}
EOF
@ -106,9 +106,9 @@ resource "datadog_monitor" "datadog_free_disk_space_inodes_too_low" {
message = "${coalesce(var.free_disk_inodes_message, var.message)}"
query = <<EOF
min(${var.free_disk_inodes_timeframe}): (
avg:system.fs.inodes.free{${data.template_file.filter.rendered},dd_disk:enabled} by {region,host,device} /
avg:system.fs.inodes.total{${data.template_file.filter.rendered},dd_disk:enabled} by {region,host,device} * 100
${var.free_disk_inodes_aggregator}(${var.free_disk_inodes_timeframe}): (
${var.free_disk_inodes_aggregator}:system.fs.inodes.free{${data.template_file.filter.rendered},dd_disk:enabled} by {region,host,device} /
${var.free_disk_inodes_aggregator}:system.fs.inodes.total{${data.template_file.filter.rendered},dd_disk:enabled} by {region,host,device} * 100
) < ${var.free_disk_inodes_threshold_critical}
EOF
@ -138,9 +138,9 @@ resource "datadog_monitor" "datadog_free_memory" {
message = "${var.free_memory_message}"
query = <<EOF
min(${var.free_memory_timeframe}): (
avg:system.mem.free{${data.template_file.filter.rendered}} by {region,host} /
avg:system.mem.total{${data.template_file.filter.rendered}} by {region,host} * 100
${var.free_memory_aggregator}(${var.free_memory_timeframe}): (
${var.free_memory_aggregator}:system.mem.free{${data.template_file.filter.rendered}} by {region,host} /
${var.free_memory_aggregator}:system.mem.total{${data.template_file.filter.rendered}} by {region,host} * 100
) < ${var.free_memory_threshold_critical}
EOF