diff --git a/README.md b/README.md index 2fc66d2..1403e5b 100644 --- a/README.md +++ b/README.md @@ -80,6 +80,7 @@ The `//` is very important, it's a terraform specific syntax used to separate gi - [cluster](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/caas/kubernetes/cluster/) - [ingress](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/caas/kubernetes/ingress/) - [vts](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/caas/kubernetes/ingress/vts/) + - [node](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/caas/kubernetes/node/) - [pod](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/caas/kubernetes/pod/) - [workload](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/caas/kubernetes/workload/) - [cloud](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/) diff --git a/caas/kubernetes/node/README.md b/caas/kubernetes/node/README.md new file mode 100644 index 0000000..91d16bb --- /dev/null +++ b/caas/kubernetes/node/README.md @@ -0,0 +1,100 @@ +# CAAS KUBERNETES NODE DataDog monitors + +## How to use this module + +``` +module "datadog-monitors-caas-kubernetes-node" { + source = "git::ssh://git@git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors.git//caas/kubernetes/node?ref={revision}" + + environment = "${var.environment}" + message = "${module.datadog-message-alerting.alerting-message}" +} + +``` + +## Purpose + +Creates DataDog monitors with the following checks: + +- Kubernetes Node Disk pressure +- Kubernetes Node Frequent unregister net device +- Kubernetes Node Kubelet API does not respond +- Kubernetes Node Kubelet sync loop that updates containers does not work +- Kubernetes Node Memory pressure +- 
Kubernetes Node not ready +- Kubernetes Node Out of disk +- Kubernetes Node unschedulable + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|:----:|:-----:|:-----:| +| disk\_out\_enabled | Flag to enable Out of disk monitor | string | `"true"` | no | +| disk\_out\_extra\_tags | Extra tags for Out of disk monitor | list | `[]` | no | +| disk\_out\_message | Custom message for Out of disk monitor | string | `""` | no | +| disk\_out\_silenced | Groups to mute for Out of disk monitor | map | `{}` | no | +| disk\_out\_threshold\_warning | Out of disk monitor (warning threshold) | string | `"3"` | no | +| disk\_pressure\_enabled | Flag to enable Disk pressure monitor | string | `"true"` | no | +| disk\_pressure\_extra\_tags | Extra tags for Disk pressure monitor | list | `[]` | no | +| disk\_pressure\_message | Custom message for Disk pressure monitor | string | `""` | no | +| disk\_pressure\_silenced | Groups to mute for Disk pressure monitor | map | `{}` | no | +| disk\_pressure\_threshold\_warning | Disk pressure monitor (warning threshold) | string | `"3"` | no | +| environment | Architecture environment | string | n/a | yes | +| evaluation\_delay | Delay in seconds for the metric evaluation | string | `"15"` | no | +| filter\_tags\_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `"*"` | no | +| filter\_tags\_custom\_excluded | Tags excluded for custom filtering when filter_tags_use_defaults is false | string | `""` | no | +| filter\_tags\_use\_defaults | Use default filter tags convention | string | `"true"` | no | +| kubelet\_ping\_enabled | Flag to enable Kubelet ping monitor | string | `"true"` | no | +| kubelet\_ping\_extra\_tags | Extra tags for Kubelet ping monitor | list | `[]` | no | +| kubelet\_ping\_message | Custom message for Kubelet ping monitor | string | `""` | no | +| kubelet\_ping\_silenced | Groups to mute for Kubelet ping monitor | map | `{}` | no | +| 
kubelet\_ping\_threshold\_warning | Kubelet ping monitor (warning threshold) | string | `"3"` | no | +| kubelet\_syncloop\_enabled | Flag to enable Kubelet sync loop monitor | string | `"true"` | no | +| kubelet\_syncloop\_extra\_tags | Extra tags for Kubelet sync loop monitor | list | `[]` | no | +| kubelet\_syncloop\_message | Custom message for Kubelet sync loop monitor | string | `""` | no | +| kubelet\_syncloop\_silenced | Groups to mute for Kubelet sync loop monitor | map | `{}` | no | +| kubelet\_syncloop\_threshold\_warning | Kubelet sync loop monitor (warning threshold) | string | `"3"` | no | +| memory\_pressure\_enabled | Flag to enable Memory pressure monitor | string | `"true"` | no | +| memory\_pressure\_extra\_tags | Extra tags for Memory pressure monitor | list | `[]` | no | +| memory\_pressure\_message | Custom message for Memory pressure monitor | string | `""` | no | +| memory\_pressure\_silenced | Groups to mute for Memory pressure monitor | map | `{}` | no | +| memory\_pressure\_threshold\_warning | Memory pressure monitor (warning threshold) | string | `"3"` | no | +| message | Message sent when a monitor is triggered | string | n/a | yes | +| new\_host\_delay | Delay in seconds before monitor new resource | string | `"300"` | no | +| node\_unschedulable\_enabled | Flag to enable node unschedulable monitor | string | `"true"` | no | +| node\_unschedulable\_extra\_tags | Extra tags for node unschedulable monitor | list | `[]` | no | +| node\_unschedulable\_message | Custom message for node unschedulable monitor | string | `""` | no | +| node\_unschedulable\_silenced | Groups to mute for node unschedulable monitor | map | `{}` | no | +| node\_unschedulable\_time\_aggregator | Monitor aggregator for node unschedulable [available values: min, max or avg] | string | `"min"` | no | +| node\_unschedulable\_timeframe | Monitor timeframe for node unschedulable [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | 
string | `"last_1h"` | no | +| ready\_enabled | Flag to enable Node ready monitor | string | `"true"` | no | +| ready\_extra\_tags | Extra tags for Node ready monitor | list | `[]` | no | +| ready\_message | Custom message for Node ready monitor | string | `""` | no | +| ready\_silenced | Groups to mute for Node ready monitor | map | `{}` | no | +| ready\_threshold\_warning | Node ready monitor (warning threshold) | string | `"3"` | no | +| unregister\_net\_device\_enabled | Flag to enable Unregister net device monitor | string | `"true"` | no | +| unregister\_net\_device\_extra\_tags | Extra tags for Unregister net device monitor | list | `[]` | no | +| unregister\_net\_device\_message | Custom message for Unregister net device monitor | string | `""` | no | +| unregister\_net\_device\_silenced | Groups to mute for Unregister net device monitor | map | `{}` | no | +| unregister\_net\_device\_threshold\_critical | Unregister net device critical threshold | string | `"3"` | no | +| unregister\_net\_device\_time\_aggregator | Monitor aggregator for Unregister net device [available values: min, max or avg] | string | `"min"` | no | +| unregister\_net\_device\_timeframe | Monitor timeframe for Unregister net device [duration such as `5m`, `15m` or `1h`, without the `last_` prefix] | string | `"15m"` | no | + +## Outputs + +| Name | Description | +|------|-------------| +| disk\_out\_id | id for monitor disk_out | +| disk\_pressure\_id | id for monitor disk_pressure | +| kubelet\_ping\_id | id for monitor kubelet_ping | +| kubelet\_syncloop\_id | id for monitor kubelet_syncloop | +| memory\_pressure\_id | id for monitor memory_pressure | +| node\_unschedulable\_id | id for monitor node_unschedulable | +| ready\_id | id for monitor ready | +| unregister\_net\_device\_id | id for monitor unregister_net_device | + +## Related documentation + +* [Datadog metrics](https://docs.datadoghq.com/agent/kubernetes/metrics/) +* [Datadog 
documentation](https://docs.datadoghq.com/integrations/kubernetes/) +* [Datadog Blog](https://www.datadoghq.com/blog/monitor-kubernetes-docker/) diff --git a/caas/kubernetes/node/inputs.tf b/caas/kubernetes/node/inputs.tf new file mode 100644 index 0000000..5edcdf6 --- /dev/null +++ b/caas/kubernetes/node/inputs.tf @@ -0,0 +1,293 @@ +# Datadog global variables + +variable "environment" { + description = "Architecture environment" +} + +variable "filter_tags_use_defaults" { + description = "Use default filter tags convention" + default = "true" +} + +variable "filter_tags_custom" { + description = "Tags used for custom filtering when filter_tags_use_defaults is false" + default = "*" +} + +variable "filter_tags_custom_excluded" { + description = "Tags excluded for custom filtering when filter_tags_use_defaults is false" + default = "" +} + +variable "message" { + description = "Message sent when a monitor is triggered" +} + +variable "evaluation_delay" { + description = "Delay in seconds for the metric evaluation" + default = 15 +} + +variable "new_host_delay" { + description = "Delay in seconds before monitor new resource" + default = 300 +} + +# Datadog monitors variables + +variable "disk_pressure_silenced" { + description = "Groups to mute for Disk pressure monitor" + type = "map" + default = {} +} + +variable "disk_pressure_enabled" { + description = "Flag to enable Disk pressure monitor" + type = "string" + default = "true" +} + +variable "disk_pressure_extra_tags" { + description = "Extra tags for Disk pressure monitor" + type = "list" + default = [] +} + +variable "disk_pressure_message" { + description = "Custom message for Disk pressure monitor" + type = "string" + default = "" +} + +variable "disk_pressure_threshold_warning" { + description = "Disk pressure monitor (warning threshold)" + type = "string" + default = 3 +} + +variable "disk_out_silenced" { + description = "Groups to mute for Out of disk monitor" + type = "map" + default = {} +} + +variable 
"disk_out_enabled" { + description = "Flag to enable Out of disk monitor" + type = "string" + default = "true" +} + +variable "disk_out_extra_tags" { + description = "Extra tags for Out of disk monitor" + type = "list" + default = [] +} + +variable "disk_out_message" { + description = "Custom message for Out of disk monitor" + type = "string" + default = "" +} + +variable "disk_out_threshold_warning" { + description = "Out of disk monitor (warning threshold)" + type = "string" + default = 3 +} + +variable "memory_pressure_silenced" { + description = "Groups to mute for Memory pressure monitor" + type = "map" + default = {} +} + +variable "memory_pressure_enabled" { + description = "Flag to enable Memory pressure monitor" + type = "string" + default = "true" +} + +variable "memory_pressure_extra_tags" { + description = "Extra tags for Memory pressure monitor" + type = "list" + default = [] +} + +variable "memory_pressure_message" { + description = "Custom message for Memory pressure monitor" + type = "string" + default = "" +} + +variable "memory_pressure_threshold_warning" { + description = "Memory pressure monitor (warning threshold)" + type = "string" + default = 3 +} + +variable "ready_silenced" { + description = "Groups to mute for Node ready monitor" + type = "map" + default = {} +} + +variable "ready_enabled" { + description = "Flag to enable Node ready monitor" + type = "string" + default = "true" +} + +variable "ready_extra_tags" { + description = "Extra tags for Node ready monitor" + type = "list" + default = [] +} + +variable "ready_message" { + description = "Custom message for Node ready monitor" + type = "string" + default = "" +} + +variable "ready_threshold_warning" { + description = "Node ready monitor (warning threshold)" + type = "string" + default = 3 +} + +variable "kubelet_ping_silenced" { + description = "Groups to mute for Kubelet ping monitor" + type = "map" + default = {} +} + +variable "kubelet_ping_enabled" { + description = "Flag to 
enable Kubelet ping monitor" + type = "string" + default = "true" +} + +variable "kubelet_ping_extra_tags" { + description = "Extra tags for Kubelet ping monitor" + type = "list" + default = [] +} + +variable "kubelet_ping_message" { + description = "Custom message for Kubelet ping monitor" + type = "string" + default = "" +} + +variable "kubelet_ping_threshold_warning" { + description = "Kubelet ping monitor (warning threshold)" + type = "string" + default = 3 +} + +variable "kubelet_syncloop_silenced" { + description = "Groups to mute for Kubelet sync loop monitor" + type = "map" + default = {} +} + +variable "kubelet_syncloop_enabled" { + description = "Flag to enable Kubelet sync loop monitor" + type = "string" + default = "true" +} + +variable "kubelet_syncloop_extra_tags" { + description = "Extra tags for Kubelet sync loop monitor" + type = "list" + default = [] +} + +variable "kubelet_syncloop_message" { + description = "Custom message for Kubelet sync loop monitor" + type = "string" + default = "" +} + +variable "kubelet_syncloop_threshold_warning" { + description = "Kubelet sync loop monitor (warning threshold)" + type = "string" + default = 3 +} + +variable "unregister_net_device_silenced" { + description = "Groups to mute for Unregister net device monitor" + type = "map" + default = {} +} + +variable "unregister_net_device_enabled" { + description = "Flag to enable Unregister net device monitor" + type = "string" + default = "true" +} + +variable "unregister_net_device_extra_tags" { + description = "Extra tags for Unregister net device monitor" + type = "list" + default = [] +} + +variable "unregister_net_device_message" { + description = "Custom message for Unregister net device monitor" + type = "string" + default = "" +} + +variable "unregister_net_device_time_aggregator" { + description = "Monitor aggregator for Unregister net device [available values: min, max or avg]" + type = "string" + default = "min" +} + +variable 
"unregister_net_device_timeframe" { + description = "Monitor timeframe for Unregister net device [duration such as `5m`, `15m` or `1h`, without the `last_` prefix]" + type = "string" + default = "15m" +} + +variable "unregister_net_device_threshold_critical" { + description = "Unregister net device critical threshold" + default = 3 +} + +variable "node_unschedulable_silenced" { + description = "Groups to mute for node unschedulable monitor" + type = "map" + default = {} +} + +variable "node_unschedulable_enabled" { + description = "Flag to enable node unschedulable monitor" + type = "string" + default = "true" +} + +variable "node_unschedulable_extra_tags" { + description = "Extra tags for node unschedulable monitor" + type = "list" + default = [] +} + +variable "node_unschedulable_message" { + description = "Custom message for node unschedulable monitor" + type = "string" + default = "" +} + +variable "node_unschedulable_time_aggregator" { + description = "Monitor aggregator for node unschedulable [available values: min, max or avg]" + type = "string" + default = "min" +} + +variable "node_unschedulable_timeframe" { + description = "Monitor timeframe for node unschedulable [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" + type = "string" + default = "last_1h" +} diff --git a/caas/kubernetes/node/modules.tf b/caas/kubernetes/node/modules.tf new file mode 100644 index 0000000..0dee522 --- /dev/null +++ b/caas/kubernetes/node/modules.tf @@ -0,0 +1,20 @@ +module "filter-tags" { + source = "../../../common/filter-tags" + + environment = "${var.environment}" + resource = "kubernetes" + filter_tags_use_defaults = "${var.filter_tags_use_defaults}" + filter_tags_custom = "${var.filter_tags_custom}" + filter_tags_custom_excluded = "${var.filter_tags_custom_excluded}" +} + +module "filter-tags-unschedulable" { + source = "../../../common/filter-tags" + + environment = "${var.environment}" + resource = 
"kubernetes" + filter_tags_use_defaults = "${var.filter_tags_use_defaults}" + filter_tags_custom = "${var.filter_tags_custom}" + filter_tags_custom_excluded = "${var.filter_tags_custom_excluded}" + extra_tags = ["status:unschedulable"] +} diff --git a/caas/kubernetes/node/monitors-k8s-node.tf b/caas/kubernetes/node/monitors-k8s-node.tf new file mode 100644 index 0000000..ac455f4 --- /dev/null +++ b/caas/kubernetes/node/monitors-k8s-node.tf @@ -0,0 +1,233 @@ +resource "datadog_monitor" "disk_pressure" { + count = "${var.disk_pressure_enabled == "true" ? 1 : 0}" + name = "[${var.environment}] Kubernetes Node Disk pressure" + message = "${coalesce(var.disk_pressure_message, var.message)}" + + type = "service check" + + query = < ${var.unregister_net_device_threshold_critical} + EOQ + + new_host_delay = "${var.new_host_delay}" + + notify_no_data = false + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + + silenced = "${var.unregister_net_device_silenced}" + tags = ["env:${var.environment}", "type:caas", "provider:kubernetes", "resource:kubernetes-node", "team:claranet", "created-by:terraform", "${var.unregister_net_device_extra_tags}"] +} + +resource "datadog_monitor" "node_unschedulable" { + count = "${var.node_unschedulable_enabled == "true" ? 
1 : 0}" + name = "[${var.environment}] Kubernetes Node unschedulable" + type = "metric alert" + message = "${coalesce(var.node_unschedulable_message, var.message)}" + + query = < 0 + EOQ + + thresholds { + critical = 0 + } + + evaluation_delay = "${var.evaluation_delay}" + new_host_delay = "${var.new_host_delay}" + + notify_no_data = true + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + + silenced = "${var.node_unschedulable_silenced}" + tags = ["env:${var.environment}", "type:caas", "provider:kubernetes", "resource:kubernetes-node", "team:claranet", "created-by:terraform", "${var.node_unschedulable_extra_tags}"] +} diff --git a/caas/kubernetes/node/outputs.tf b/caas/kubernetes/node/outputs.tf new file mode 100644 index 0000000..0d0ffa3 --- /dev/null +++ b/caas/kubernetes/node/outputs.tf @@ -0,0 +1,39 @@ +output "disk_pressure_id" { + description = "id for monitor disk_pressure" + value = "${datadog_monitor.disk_pressure.*.id}" +} + +output "disk_out_id" { + description = "id for monitor disk_out" + value = "${datadog_monitor.disk_out.*.id}" +} + +output "memory_pressure_id" { + description = "id for monitor memory_pressure" + value = "${datadog_monitor.memory_pressure.*.id}" +} + +output "ready_id" { + description = "id for monitor ready" + value = "${datadog_monitor.ready.*.id}" +} + +output "kubelet_ping_id" { + description = "id for monitor kubelet_ping" + value = "${datadog_monitor.kubelet_ping.*.id}" +} + +output "kubelet_syncloop_id" { + description = "id for monitor kubelet_syncloop" + value = "${datadog_monitor.kubelet_syncloop.*.id}" +} + +output "unregister_net_device_id" { + description = "id for monitor unregister_net_device" + value = "${datadog_monitor.unregister_net_device.*.id}" +} + +output "node_unschedulable_id" { + description = "id for monitor node_unschedulable" + value = "${datadog_monitor.node_unschedulable.*.id}" +}