MON-96 update system readme and add load monitor
This commit is contained in:
parent
88ae2f4c51
commit
ca9cdff481
@ -17,7 +17,11 @@ Purpose
|
|||||||
-------
|
-------
|
||||||
Creates DataDog monitors with the following checks:
|
Creates DataDog monitors with the following checks:
|
||||||
|
|
||||||
* System CPU High
|
* CPU usage
|
||||||
|
* CPU load ratio
|
||||||
|
* Free memory
|
||||||
|
* Free disk inodes
|
||||||
|
* Free disk space
|
||||||
|
|
||||||
Inputs
|
Inputs
|
||||||
------
|
------
|
||||||
@ -29,6 +33,11 @@ Inputs
|
|||||||
| cpu_high_threshold_critical | CPU high critical threshold | string | `95` | no |
|
| cpu_high_threshold_critical | CPU high critical threshold | string | `95` | no |
|
||||||
| cpu_high_threshold_warning | CPU high warning threshold | string | `80` | no |
|
| cpu_high_threshold_warning | CPU high warning threshold | string | `80` | no |
|
||||||
| cpu_high_timeframe | CPU high timeframe | string | `last_5m` | no |
|
| cpu_high_timeframe | CPU high timeframe | string | `last_5m` | no |
|
||||||
|
| cpu_load_message | Custom message for CPU load ratio monitor | string | `` | no |
|
||||||
|
| cpu_load_silenced | Groups to mute for CPU load ratio monitor | map | `<map>` | no |
|
||||||
|
| cpu_load_threshold_critical | CPU load ratio critical threshold | string | `4` | no |
|
||||||
|
| cpu_load_threshold_warning | CPU load ratio warning threshold | string | `3` | no |
|
||||||
|
| cpu_load_timeframe | CPU load ratio timeframe | string | `last_5m` | no |
|
||||||
| environment | Architecture Environment | string | - | yes |
|
| environment | Architecture Environment | string | - | yes |
|
||||||
| delay | Delay in seconds for the metric evaluation | string | `15` | no |
|
| delay | Delay in seconds for the metric evaluation | string | `15` | no |
|
||||||
| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no |
|
| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no |
|
||||||
|
|||||||
@ -53,6 +53,33 @@ variable "cpu_high_threshold_critical" {
|
|||||||
default = 95
|
default = 95
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Mute settings for the CPU load ratio monitor; the value is handed as-is to
# the monitor's "silenced" argument. Defaults to an empty map.
variable "cpu_load_silenced" {
  type        = "map"
  default     = {}
  description = "Groups to mute for CPU load ratio monitor"
}
|
||||||
|
|
||||||
|
# Optional notification message specific to the CPU load ratio monitor.
# The monitor resource coalesces this value with var.message, so the empty
# default falls back to the module-wide message.
variable "cpu_load_message" {
  type        = "string"
  default     = ""
  description = "Custom message for CPU load ratio monitor"
}
|
||||||
|
|
||||||
|
# Evaluation window interpolated into the CPU load ratio monitor query
# as "min(<timeframe>): ...".
variable "cpu_load_timeframe" {
  default     = "last_5m"
  description = "CPU load ratio timeframe"
}
|
||||||
|
|
||||||
|
# Warning level for the CPU load ratio monitor (load 5 divided by core count);
# used as the "warning" entry of the monitor's thresholds block.
variable "cpu_load_threshold_warning" {
  default     = 3
  description = "CPU load ratio warning threshold"
}
|
||||||
|
|
||||||
|
# Critical level for the CPU load ratio monitor (load 5 divided by core count);
# used both in the monitor query comparison and as the "critical" threshold.
variable "cpu_load_threshold_critical" {
  default     = 4
  description = "CPU load ratio critical threshold"
}
|
||||||
|
|
||||||
variable "free_disk_space_silenced" {
|
variable "free_disk_space_silenced" {
|
||||||
description = "Groups to mute for Free diskspace monitor"
|
description = "Groups to mute for Free diskspace monitor"
|
||||||
type = "map"
|
type = "map"
|
||||||
|
|||||||
@ -38,6 +38,39 @@ resource "datadog_monitor" "datadog_cpu_too_high" {
|
|||||||
silenced = "${var.cpu_high_silenced}"
|
silenced = "${var.cpu_high_silenced}"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# CPU load ratio monitor.
#
# Alerts when the minimum, over var.cpu_load_timeframe, of the 5-minute system
# load divided by the core count (grouped by region and host) exceeds the
# critical threshold. The monitored value is a ratio (load per core), not a
# percentage, so the name template renders thresholds and values without "%".
resource "datadog_monitor" "datadog_load_too_high" {
  name    = "[${var.environment}] CPU load 5 ratio {{#is_alert}}{{comparator}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}} ({{value}}){{/is_warning}}"
  message = "${coalesce(var.cpu_load_message, var.message)}"

  # Indented heredoc (<<-) so the terminator can be indented with the block;
  # the query text itself is unchanged.
  query = <<-EOF
  min(${var.cpu_load_timeframe}): (
  avg:system.load.5{${data.template_file.filter.rendered}} by {region,host} /
  avg:system.core.count{${data.template_file.filter.rendered}} by {region,host}
  ) > ${var.cpu_load_threshold_critical}
  EOF

  type = "metric alert"

  # The critical value must match the one compared against in the query above.
  thresholds {
    warning  = "${var.cpu_load_threshold_warning}"
    critical = "${var.cpu_load_threshold_critical}"
  }

  tags = ["env:${var.environment}", "type:system", "resource:load"]

  notify_no_data      = true
  evaluation_delay    = "${var.delay}"
  new_host_delay      = "${var.delay}"
  notify_audit        = false
  timeout_h           = 0
  include_tags        = true
  locked              = false
  require_full_window = true
  no_data_timeframe   = 20

  silenced = "${var.cpu_load_silenced}"
}
|
||||||
|
|
||||||
resource "datadog_monitor" "datadog_free_disk_space_too_low" {
|
resource "datadog_monitor" "datadog_free_disk_space_too_low" {
|
||||||
name = "[${var.environment}] Free disk space {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
name = "[${var.environment}] Free disk space {{#is_alert}}{{comparator}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{comparator}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
|
||||||
message = "${coalesce(var.free_disk_space_message, var.message)}"
|
message = "${coalesce(var.free_disk_space_message, var.message)}"
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user