MON-96 update system readme and add load monitor
This commit is contained in:
parent
88ae2f4c51
commit
ca9cdff481
@ -17,7 +17,11 @@ Purpose
|
||||
-------
|
||||
Creates DataDog monitors with the following checks:
|
||||
|
||||
* System CPU High
|
||||
* CPU usage
|
||||
* CPU load ratio
|
||||
* Free memory
|
||||
* Free disk inodes
|
||||
* Free disk space
|
||||
|
||||
Inputs
|
||||
------
|
||||
@ -29,6 +33,11 @@ Inputs
|
||||
| cpu_high_threshold_critical | CPU high critical threshold | string | `95` | no |
|
||||
| cpu_high_threshold_warning | CPU high warning threshold | string | `80` | no |
|
||||
| cpu_high_timeframe | CPU high timeframe | string | `last_5m` | no |
|
||||
| cpu_load_message | Custom message for CPU load ratio monitor | string | `` | no |
|
||||
| cpu_load_silenced | Groups to mute for CPU load ratio monitor | map | `<map>` | no |
|
||||
| cpu_load_threshold_critical | CPU load ratio critical threshold | string | `4` | no |
|
||||
| cpu_load_threshold_warning | CPU load ratio warning threshold | string | `3` | no |
|
||||
| cpu_load_timeframe | CPU load ratio timeframe | string | `last_5m` | no |
|
||||
| environment | Architecture Environment | string | - | yes |
|
||||
| delay | Delay in seconds for the metric evaluation | string | `15` | no |
|
||||
| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no |
|
||||
|
||||
@ -53,6 +53,33 @@ variable "cpu_high_threshold_critical" {
|
||||
default = 95
|
||||
}
|
||||
|
||||
# Mute configuration for the CPU load ratio monitor; the value is handed
# unchanged to that monitor's `silenced` argument. The empty default map
# supplies no groups to mute.
variable "cpu_load_silenced" {
  type        = "map"
  default     = {}
  description = "Groups to mute for CPU load ratio monitor"
}
|
||||
|
||||
# Optional notification text specific to the CPU load ratio monitor.
# The monitor resolves its message with coalesce(var.cpu_load_message,
# var.message), so leaving this empty falls back to the shared `message`.
variable "cpu_load_message" {
  type        = "string"
  default     = ""
  description = "Custom message for CPU load ratio monitor"
}
|
||||
|
||||
# Evaluation window interpolated into the CPU load ratio query as
# `min(<timeframe>): ...`.
variable "cpu_load_timeframe" {
  default     = "last_5m"
  description = "CPU load ratio timeframe"
}
|
||||
|
||||
# Warning level for the CPU load ratio monitor. The monitored value is
# system.load.5 divided by system.core.count, so this is a per-core load
# ratio rather than a percentage.
variable "cpu_load_threshold_warning" {
  default     = 3
  description = "CPU load ratio warning threshold"
}
|
||||
|
||||
# Critical level for the CPU load ratio monitor. Used both as the alert
# comparison value in the monitor query and as its `critical` threshold.
# The monitored value is system.load.5 divided by system.core.count, so
# this is a per-core load ratio rather than a percentage.
variable "cpu_load_threshold_critical" {
  default     = 4
  description = "CPU load ratio critical threshold"
}
|
||||
|
||||
variable "free_disk_space_silenced" {
|
||||
description = "Groups to mute for Free diskspace monitor"
|
||||
type = "map"
|
||||
|
||||
@ -38,6 +38,39 @@ resource "datadog_monitor" "datadog_cpu_too_high" {
|
||||
silenced = "${var.cpu_high_silenced}"
|
||||
}
|
||||
|
||||
# CPU load ratio monitor.
#
# Alerts when the 5-minute load average divided by the core count, evaluated
# per {region,host}, stays above the configured thresholds for the whole
# `cpu_load_timeframe` window (the query takes the `min` over the window).
resource "datadog_monitor" "datadog_load_too_high" {
  # The monitored value is a unitless load-per-core ratio, so the threshold
  # and value are rendered without a "%" suffix.
  name = "[${var.environment}] CPU load 5 {{#is_alert}}{{comparator}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}} ({{value}}){{/is_warning}}"

  # Monitor-specific message wins; otherwise fall back to the shared one.
  message = "${coalesce(var.cpu_load_message, var.message)}"

  # Heredoc body is kept flush-left so the query string carries no extra
  # leading whitespace. The comparison value must match thresholds.critical.
  query = <<EOF
min(${var.cpu_load_timeframe}): (
avg:system.load.5{${data.template_file.filter.rendered}} by {region,host} /
avg:system.core.count{${data.template_file.filter.rendered}} by {region,host}
) > ${var.cpu_load_threshold_critical}
EOF

  type = "metric alert"

  thresholds {
    warning  = "${var.cpu_load_threshold_warning}"
    critical = "${var.cpu_load_threshold_critical}"
  }

  tags = ["env:${var.environment}", "type:system", "resource:load"]

  notify_no_data      = true
  evaluation_delay    = "${var.delay}"
  new_host_delay      = "${var.delay}"
  notify_audit        = false
  timeout_h           = 0
  include_tags        = true
  locked              = false
  require_full_window = true
  no_data_timeframe   = 20

  silenced = "${var.cpu_load_silenced}"
}
|
||||
|
||||
resource "datadog_monitor" "datadog_free_disk_space_too_low" {
|
||||
name = "[${var.environment}] Free disk space {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||
message = "${coalesce(var.free_disk_space_message, var.message)}"
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user