MON-96 update system readme and add load monitor

This commit is contained in:
Quentin Manfroi 2018-03-23 23:22:29 +01:00
parent 88ae2f4c51
commit ca9cdff481
3 changed files with 70 additions and 1 deletions

View File

@ -17,7 +17,11 @@ Purpose
-------
Creates DataDog monitors with the following checks:
* System CPU High
* CPU usage
* CPU load ratio
* Free memory
* Free disk inodes
* Free disk space
Inputs
------
@ -29,6 +33,11 @@ Inputs
| cpu_high_threshold_critical | CPU high critical threshold | string | `95` | no |
| cpu_high_threshold_warning | CPU high warning threshold | string | `80` | no |
| cpu_high_timeframe | CPU high timeframe | string | `last_5m` | no |
| cpu_load_message | Custom message for CPU load ratio monitor | string | `` | no |
| cpu_load_silenced | Groups to mute for CPU load ratio monitor | map | `<map>` | no |
| cpu_load_threshold_critical | CPU load ratio critical threshold | string | `4` | no |
| cpu_load_threshold_warning | CPU load ratio warning threshold | string | `3` | no |
| cpu_load_timeframe | CPU load ratio timeframe | string | `last_5m` | no |
| environment | Architecture Environment | string | - | yes |
| delay | Delay in seconds for the metric evaluation | string | `15` | no |
| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no |

View File

@ -53,6 +53,33 @@ variable "cpu_high_threshold_critical" {
default = 95
}
# Mute configuration for the CPU load ratio monitor.
# Passed straight through to that monitor's `silenced` argument; the empty
# default means nothing is muted.
variable "cpu_load_silenced" {
  type        = "map"
  default     = {}
  description = "Groups to mute for CPU load ratio monitor"
}
# Optional notification text specific to the CPU load ratio monitor.
# The monitor resolves its message with coalesce(var.cpu_load_message, var.message),
# so leaving this empty falls back to the module-wide `message`.
variable "cpu_load_message" {
  type        = "string"
  default     = ""
  description = "Custom message for CPU load ratio monitor"
}
# Evaluation window for the CPU load ratio monitor.
# Interpolated into the monitor query as `min(<timeframe>): ...`, so the value
# must be a timeframe token the Datadog query language accepts (e.g. last_5m).
variable "cpu_load_timeframe" {
  description = "CPU load ratio timeframe"

  # Declared explicitly to match cpu_load_message and the README, which lists
  # this input as a string.
  type    = "string"
  default = "last_5m"
}
# Warning level for the CPU load ratio monitor.
# Compared against system.load.5 divided by system.core.count, i.e. a load
# per core ratio rather than a percentage.
variable "cpu_load_threshold_warning" {
  default     = 3
  description = "CPU load ratio warning threshold"
}
# Critical level for the CPU load ratio monitor.
# Used both as the comparison value in the monitor query and as the
# `critical` threshold; it is a load per core ratio, not a percentage.
variable "cpu_load_threshold_critical" {
  default     = 4
  description = "CPU load ratio critical threshold"
}
variable "free_disk_space_silenced" {
description = "Groups to mute for Free diskspace monitor"
type = "map"

View File

@ -38,6 +38,39 @@ resource "datadog_monitor" "datadog_cpu_too_high" {
silenced = "${var.cpu_high_silenced}"
}
# CPU load ratio monitor.
#
# Evaluates the 5-minute load average divided by the core count, grouped by
# {region,host}, and triggers when the minimum of that ratio over
# var.cpu_load_timeframe exceeds the critical threshold.
#
# The evaluated value is a unitless ratio (defaults: warning 3, critical 4),
# so the monitor name prints {{threshold}} / {{value}} without a "%" suffix.
resource "datadog_monitor" "datadog_load_too_high" {
  name = "[${var.environment}] CPU load 5 {{#is_alert}}{{comparator}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}} ({{value}}){{/is_warning}}"

  # Monitor-specific message when set, otherwise the module-wide one.
  message = "${coalesce(var.cpu_load_message, var.message)}"

  # The comparison value in the query must stay equal to thresholds.critical.
  query = <<EOF
min(${var.cpu_load_timeframe}): (
avg:system.load.5{${data.template_file.filter.rendered}} by {region,host} /
avg:system.core.count{${data.template_file.filter.rendered}} by {region,host}
) > ${var.cpu_load_threshold_critical}
EOF

  type = "metric alert"

  thresholds {
    warning  = "${var.cpu_load_threshold_warning}"
    critical = "${var.cpu_load_threshold_critical}"
  }

  tags = ["env:${var.environment}", "type:system", "resource:load"]

  # Missing data is reported after no_data_timeframe (20) without any points.
  notify_no_data    = true
  no_data_timeframe = 20

  # The same delay is applied to metric evaluation and to newly seen hosts.
  evaluation_delay = "${var.delay}"
  new_host_delay   = "${var.delay}"

  require_full_window = true
  include_tags        = true
  notify_audit        = false
  locked              = false
  timeout_h           = 0

  silenced = "${var.cpu_load_silenced}"
}
resource "datadog_monitor" "datadog_free_disk_space_too_low" {
name = "[${var.environment}] Free disk space {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
message = "${coalesce(var.free_disk_space_message, var.message)}"