diff --git a/README.md b/README.md index f776ffc..2a9fd5a 100644 --- a/README.md +++ b/README.md @@ -80,6 +80,7 @@ The `//` is very important, it's a terraform specific syntax used to separate gi - [cluster](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/caas/kubernetes/cluster/) - [ingress](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/caas/kubernetes/ingress/) - [vts](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/caas/kubernetes/ingress/vts/) + - [workload](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/caas/kubernetes/workload/) - [cloud](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/) - [aws](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/aws/) - [alb](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/aws/alb/) diff --git a/caas/kubernetes/workload/README.md b/caas/kubernetes/workload/README.md new file mode 100644 index 0000000..2e4eb76 --- /dev/null +++ b/caas/kubernetes/workload/README.md @@ -0,0 +1,82 @@ +# CAAS KUBERNETES WORKLOAD DataDog monitors + +## How to use this module + +``` +module "datadog-monitors-caas-kubernetes-workload" { + source = "git::ssh://git@git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors.git//caas/kubernetes/workload?ref={revision}" + + environment = "${var.environment}" + message = "${module.datadog-message-alerting.alerting-message}" +} + +``` + +## Purpose + +Creates DataDog monitors with the following checks: + +- Kubernetes Available replicas +- Kubernetes cronjob scheduling failed +- Kubernetes Current replicas +- Kubernetes job failed +- Kubernetes Ready replicas + +## Inputs + +| Name | Description | Type | Default | Required | 
+|------|-------------|:----:|:-----:|:-----:| +| cronjob\_enabled | Flag to enable Cronjob monitor | string | `"true"` | no | +| cronjob\_extra\_tags | Extra tags for Cronjob monitor | list | `[]` | no | +| cronjob\_message | Custom message for Cronjob monitor | string | `""` | no | +| cronjob\_silenced | Groups to mute for Cronjob monitor | map | `{}` | no | +| cronjob\_threshold\_warning | Cronjob monitor (warning threshold) | string | `"3"` | no | +| environment | Architecture environment | string | n/a | yes | +| evaluation\_delay | Delay in seconds for the metric evaluation | string | `"15"` | no | +| filter\_tags\_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `"*"` | no | +| filter\_tags\_custom\_excluded | Tags excluded for custom filtering when filter_tags_use_defaults is false | string | `""` | no | +| filter\_tags\_use\_defaults | Use default filter tags convention | string | `"true"` | no | +| job\_enabled | Flag to enable Job monitor | string | `"true"` | no | +| job\_extra\_tags | Extra tags for Job monitor | list | `[]` | no | +| job\_message | Custom message for Job monitor | string | `""` | no | +| job\_silenced | Groups to mute for Job monitor | map | `{}` | no | +| job\_threshold\_warning | Job monitor (warning threshold) | string | `"3"` | no | +| message | Message sent when a monitor is triggered | string | n/a | yes | +| new\_host\_delay | Delay in seconds before monitor new resource | string | `"300"` | no | +| replica\_available\_enabled | Flag to enable Available replica monitor | string | `"true"` | no | +| replica\_available\_extra\_tags | Extra tags for Available replica monitor | list | `[]` | no | +| replica\_available\_message | Custom message for Available replica monitor | string | `""` | no | +| replica\_available\_silenced | Groups to mute for Available replica monitor | map | `{}` | no | +| replica\_available\_threshold\_critical | Available replica critical threshold | string | `"1"` | 
no | +| replica\_available\_time\_aggregator | Monitor aggregator for Available replica [available values: min, max or avg] | string | `"max"` | no | +| replica\_available\_timeframe | Monitor timeframe for Available replica [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_15m"` | no | +| replica\_current\_enabled | Flag to enable Current replica monitor | string | `"true"` | no | +| replica\_current\_extra\_tags | Extra tags for Current replica monitor | list | `[]` | no | +| replica\_current\_message | Custom message for Current replica monitor | string | `""` | no | +| replica\_current\_silenced | Groups to mute for Current replica monitor | map | `{}` | no | +| replica\_current\_threshold\_critical | Current replica critical threshold | string | `"1"` | no | +| replica\_current\_time\_aggregator | Monitor aggregator for Current replica [available values: min, max or avg] | string | `"max"` | no | +| replica\_current\_timeframe | Monitor timeframe for Current replica [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_15m"` | no | +| replica\_ready\_enabled | Flag to enable Ready replica monitor | string | `"true"` | no | +| replica\_ready\_extra\_tags | Extra tags for Ready replica monitor | list | `[]` | no | +| replica\_ready\_message | Custom message for Ready replica monitor | string | `""` | no | +| replica\_ready\_silenced | Groups to mute for Ready replica monitor | map | `{}` | no | +| replica\_ready\_threshold\_critical | Ready replica critical threshold | string | `"1"` | no | +| replica\_ready\_time\_aggregator | Monitor aggregator for Ready replica [available values: min, max or avg] | string | `"max"` | no | +| replica\_ready\_timeframe | Monitor timeframe for Ready replica [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no | + +## Outputs + +| Name | Description | 
+|------|-------------| +| cronjob\_id | id for monitor cronjob | +| job\_id | id for monitor job | +| replica\_available\_id | id for monitor replica_available | +| replica\_current\_id | id for monitor replica_current | +| replica\_ready\_id | id for monitor replica_ready | + +## Related documentation + +* [Datadog metrics](https://docs.datadoghq.com/agent/kubernetes/metrics/) +* [Datadog documentation](https://docs.datadoghq.com/integrations/kubernetes/) +* [Datadog Blog](https://www.datadoghq.com/blog/monitor-kubernetes-docker/) diff --git a/caas/kubernetes/workload/inputs.tf b/caas/kubernetes/workload/inputs.tf new file mode 100644 index 0000000..c526e96 --- /dev/null +++ b/caas/kubernetes/workload/inputs.tf @@ -0,0 +1,219 @@ +# Datadog global variables + +variable "environment" { + description = "Architecture environment" +} + +variable "filter_tags_use_defaults" { + description = "Use default filter tags convention" + default = "true" +} + +variable "filter_tags_custom" { + description = "Tags used for custom filtering when filter_tags_use_defaults is false" + default = "*" +} + +variable "filter_tags_custom_excluded" { + description = "Tags excluded for custom filtering when filter_tags_use_defaults is false" + default = "" +} + +variable "message" { + description = "Message sent when a monitor is triggered" +} + +variable "evaluation_delay" { + description = "Delay in seconds for the metric evaluation" + default = 15 +} + +variable "new_host_delay" { + description = "Delay in seconds before monitor new resource" + default = 300 +} + +# Datadog monitors variables + +variable "job_silenced" { + description = "Groups to mute for Job monitor" + type = "map" + default = {} +} + +variable "job_enabled" { + description = "Flag to enable Job monitor" + type = "string" + default = "true" +} + +variable "job_extra_tags" { + description = "Extra tags for Job monitor" + type = "list" + default = [] +} + +variable "job_message" { + description = "Custom message for Job 
monitor" + type = "string" + default = "" +} + +variable "job_threshold_warning" { + description = "Job monitor (warning threshold)" + type = "string" + default = 3 +} + +variable "cronjob_silenced" { + description = "Groups to mute for Cronjob monitor" + type = "map" + default = {} +} + +variable "cronjob_enabled" { + description = "Flag to enable Cronjob monitor" + type = "string" + default = "true" +} + +variable "cronjob_extra_tags" { + description = "Extra tags for Cronjob monitor" + type = "list" + default = [] +} + +variable "cronjob_message" { + description = "Custom message for Cronjob monitor" + type = "string" + default = "" +} + +variable "cronjob_threshold_warning" { + description = "Cronjob monitor (warning threshold)" + type = "string" + default = 3 +} + +variable "replica_available_silenced" { + description = "Groups to mute for Available replica monitor" + type = "map" + default = {} +} + +variable "replica_available_enabled" { + description = "Flag to enable Available replica monitor" + type = "string" + default = "true" +} + +variable "replica_available_extra_tags" { + description = "Extra tags for Available replicamonitor" + type = "list" + default = [] +} + +variable "replica_available_message" { + description = "Custom message for Available replica monitor" + type = "string" + default = "" +} + +variable "replica_available_time_aggregator" { + description = "Monitor aggregator for Available replica [available values: min, max or avg]" + type = "string" + default = "max" +} + +variable "replica_available_timeframe" { + description = "Monitor timeframe for Available replica [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" + type = "string" + default = "last_15m" +} + +variable "replica_available_threshold_critical" { + default = 1 + description = "Available replica critical threshold" +} + +variable "replica_ready_silenced" { + description = "Groups to mute for Ready replica monitor" + type = "map" + 
default = {} +} + +variable "replica_ready_enabled" { + description = "Flag to enable Ready replica monitor" + type = "string" + default = "true" +} + +variable "replica_ready_extra_tags" { + description = "Extra tags for Ready replica monitor" + type = "list" + default = [] +} + +variable "replica_ready_message" { + description = "Custom message for Ready replica monitor" + type = "string" + default = "" +} + +variable "replica_ready_time_aggregator" { + description = "Monitor aggregator for Ready replica [available values: min, max or avg]" + type = "string" + default = "max" +} + +variable "replica_ready_timeframe" { + description = "Monitor timeframe for Ready replica [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" + type = "string" + default = "last_5m" +} + +variable "replica_ready_threshold_critical" { + default = 1 + description = "Ready replica critical threshold" +} + +variable "replica_current_silenced" { + description = "Groups to mute for Current replica monitor" + type = "map" + default = {} +} + +variable "replica_current_enabled" { + description = "Flag to enable Current replica monitor" + type = "string" + default = "true" +} + +variable "replica_current_extra_tags" { + description = "Extra tags for Current replica monitor" + type = "list" + default = [] +} + +variable "replica_current_message" { + description = "Custom message for Current replica monitor" + type = "string" + default = "" +} + +variable "replica_current_time_aggregator" { + description = "Monitor aggregator for Current replica [available values: min, max or avg]" + type = "string" + default = "max" +} + +variable "replica_current_timeframe" { + description = "Monitor timeframe for Current replica [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" + type = "string" + default = "last_15m" +} + +variable "replica_current_threshold_critical" { + default = 1 + description = "Current replica critical 
threshold" +} diff --git a/caas/kubernetes/workload/modules.tf b/caas/kubernetes/workload/modules.tf new file mode 100644 index 0000000..ad6b115 --- /dev/null +++ b/caas/kubernetes/workload/modules.tf @@ -0,0 +1,9 @@ +module "filter-tags" { + source = "../../../common/filter-tags" + + environment = "${var.environment}" + resource = "kubernetes" + filter_tags_use_defaults = "${var.filter_tags_use_defaults}" + filter_tags_custom = "${var.filter_tags_custom}" + filter_tags_custom_excluded = "${var.filter_tags_custom_excluded}" +} diff --git a/caas/kubernetes/workload/monitors-k8s-workload.tf b/caas/kubernetes/workload/monitors-k8s-workload.tf new file mode 100644 index 0000000..29a12cc --- /dev/null +++ b/caas/kubernetes/workload/monitors-k8s-workload.tf @@ -0,0 +1,155 @@ +resource "datadog_monitor" "job" { + count = "${var.job_enabled == "true" ? 1 : 0}" + name = "[${var.environment}] Kubernetes job failed" + message = "${coalesce(var.job_message, var.message)}" + + type = "service check" + + query = <