diff --git a/README.md b/README.md index 8f4a8f2..f1093fe 100644 --- a/README.md +++ b/README.md @@ -113,6 +113,7 @@ The `//` is very important, it's a terraform specific syntax used to separate gi - [sql-database](https://git.fr.clara.net/claranet/cloudnative/projects/datadog/terraform/monitors/tree/master/cloud/azure/sql-database/) - [storage](https://git.fr.clara.net/claranet/cloudnative/projects/datadog/terraform/monitors/tree/master/cloud/azure/storage/) - [stream-analytics](https://git.fr.clara.net/claranet/cloudnative/projects/datadog/terraform/monitors/tree/master/cloud/azure/stream-analytics/) + - [virtual-machine](https://git.fr.clara.net/claranet/cloudnative/projects/datadog/terraform/monitors/tree/master/cloud/azure/virtual-machine/) - [gcp](https://git.fr.clara.net/claranet/cloudnative/projects/datadog/terraform/monitors/tree/master/cloud/gcp/) - [big-query](https://git.fr.clara.net/claranet/cloudnative/projects/datadog/terraform/monitors/tree/master/cloud/gcp/big-query/) - [cloud-sql](https://git.fr.clara.net/claranet/cloudnative/projects/datadog/terraform/monitors/tree/master/cloud/gcp/cloud-sql/) diff --git a/cloud/azure/virtual-machine/README.md b/cloud/azure/virtual-machine/README.md new file mode 100644 index 0000000..a19023f --- /dev/null +++ b/cloud/azure/virtual-machine/README.md @@ -0,0 +1,67 @@ +# CLOUD AZURE VIRTUAL-MACHINE DataDog monitors + +## How to use this module + +``` +module "datadog-monitors-cloud-azure-virtual-machine" { + source = "git::ssh://git@git.fr.clara.net/claranet/cloudnative/projects/datadog/terraform/monitors.git//cloud/azure/virtual-machine?ref={revision}" + + environment = "${var.environment}" + message = "${module.datadog-message-alerting.alerting-message}" +} + +``` + +## Purpose + +Creates DataDog monitors with the following checks: + +- Virtual Machine CPU usage +- Virtual Machine credit CPU too low +- Virtual Machine is down + +## Inputs + +| Name | Description | Type | Default | Required | 
+|------|-------------|:----:|:-----:|:-----:| +| cpu\_remaining\_rate\_enabled | Flag to enable Virtual Machine CPU remaining monitor | string | `"true"` | no | +| cpu\_remaining\_rate\_extra\_tags | Extra tags for Virtual Machine CPU remaining monitor | list | `[]` | no | +| cpu\_remaining\_rate\_message | Custom message for Virtual Machine CPU remaining monitor | string | `""` | no | +| cpu\_remaining\_rate\_silenced | Groups to mute for Virtual Machine CPU remaining monitor | map | `{}` | no | +| cpu\_remaining\_rate\_threshold\_critical | Virtual Machine remaining CPU credit rate in percent (critical threshold) | string | `"15"` | no | +| cpu\_remaining\_rate\_threshold\_warning | Virtual Machine remaining CPU credit rate in percent (warning threshold) | string | `"30"` | no | +| cpu\_remaining\_rate\_time\_aggregator | Monitor aggregator for Virtual Machine CPU remaining [available values: min, max, sum or avg] | string | `"min"` | no | +| cpu\_remaining\_rate\_timeframe | Monitor timeframe for Virtual Machine CPU remaining [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no | +| cpu\_usage\_enabled | Flag to enable Virtual Machine CPU monitor | string | `"true"` | no | +| cpu\_usage\_extra\_tags | Extra tags for Virtual Machine CPU monitor | list | `[]` | no | +| cpu\_usage\_message | Custom message for Virtual Machine CPU monitor | string | `""` | no | +| cpu\_usage\_silenced | Groups to mute for Virtual Machine CPU monitor | map | `{}` | no | +| cpu\_usage\_threshold\_critical | Virtual Machine CPU usage in percent (critical threshold) | string | `"90"` | no | +| cpu\_usage\_threshold\_warning | Virtual Machine CPU usage in percent (warning threshold) | string | `"80"` | no | +| cpu\_usage\_time\_aggregator | Monitor aggregator for Virtual Machine CPU [available values: min, max or avg] | string | `"min"` | no | +| cpu\_usage\_timeframe | Monitor timeframe for Virtual Machine CPU [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 
2, or 4), or `last_1d`] | string | `"last_15m"` | no | +| environment | Architecture environment | string | n/a | yes | +| evaluation\_delay | Delay in seconds for the metric evaluation | string | `"900"` | no | +| filter\_tags\_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `"*"` | no | +| filter\_tags\_custom\_excluded | Tags excluded for custom filtering when filter_tags_use_defaults is false | string | `""` | no | +| filter\_tags\_use\_defaults | Use default filter tags convention | string | `"true"` | no | +| message | Message sent when a Virtual Machine monitor is triggered | string | n/a | yes | +| new\_host\_delay | Delay in seconds before monitor new resource | string | `"300"` | no | +| status\_enabled | Flag to enable Virtual Machine status monitor | string | `"true"` | no | +| status\_extra\_tags | Extra tags for Virtual Machine status monitor | list | `[]` | no | +| status\_message | Custom message for Virtual Machine status monitor | string | `""` | no | +| status\_silenced | Groups to mute for Virtual Machine status monitor | map | `{}` | no | +| status\_time\_aggregator | Monitor aggregator for Virtual Machine status [available values: min, max or avg] | string | `"max"` | no | +| status\_timeframe | Monitor timeframe for Virtual Machine status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no | + +## Outputs + +| Name | Description | +|------|-------------| +| virtualmachine\_cpu\_usage\_id | id for monitor virtualmachine_cpu_usage | +| virtualmachine\_credit\_cpu\_remaining\_too\_low\_id | id for monitor virtualmachine_credit_cpu_remaining_too_low | +| virtualmachine\_status\_id | id for monitor virtualmachine_status | + +## Related documentation + +DataDog documentation: [https://docs.datadoghq.com/integrations/azure_vm/](https://docs.datadoghq.com/integrations/azure_vm/) diff --git a/cloud/azure/virtual-machine/inputs.tf 
b/cloud/azure/virtual-machine/inputs.tf new file mode 100644 index 0000000..3720bf7 --- /dev/null +++ b/cloud/azure/virtual-machine/inputs.tf @@ -0,0 +1,164 @@ +# Global Terraform +variable "environment" { + description = "Architecture environment" + type = "string" +} + +# Global DataDog +variable "message" { + description = "Message sent when a Virtual Machine monitor is triggered" +} + +variable "evaluation_delay" { + description = "Delay in seconds for the metric evaluation" + default = 900 +} + +variable "new_host_delay" { + description = "Delay in seconds before monitor new resource" + default = 300 +} + +variable "filter_tags_use_defaults" { + description = "Use default filter tags convention" + default = "true" +} + +variable "filter_tags_custom" { + description = "Tags used for custom filtering when filter_tags_use_defaults is false" + default = "*" +} + +variable "filter_tags_custom_excluded" { + description = "Tags excluded for custom filtering when filter_tags_use_defaults is false" + default = "" +} + +# Azure Virtual Machine specific variables +variable "status_silenced" { + description = "Groups to mute for Virtual Machine status monitor" + type = "map" + default = {} +} + +variable "status_enabled" { + description = "Flag to enable Virtual Machine status monitor" + type = "string" + default = "true" +} + +variable "status_extra_tags" { + description = "Extra tags for Virtual Machine status monitor" + type = "list" + default = [] +} + +variable "status_message" { + description = "Custom message for Virtual Machine status monitor" + type = "string" + default = "" +} + +variable "status_time_aggregator" { + description = "Monitor aggregator for Virtual Machine status [available values: min, max or avg]" + type = "string" + default = "max" +} + +variable "status_timeframe" { + description = "Monitor timeframe for Virtual Machine status [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" + type = "string" + default = 
"last_5m" +} + +variable "cpu_usage_silenced" { + description = "Groups to mute for Virtual Machine CPU monitor" + type = "map" + default = {} +} + +variable "cpu_usage_enabled" { + description = "Flag to enable Virtual Machine CPU monitor" + type = "string" + default = "true" +} + +variable "cpu_usage_extra_tags" { + description = "Extra tags for Virtual Machine CPU monitor" + type = "list" + default = [] +} + +variable "cpu_usage_message" { + description = "Custom message for Virtual Machine CPU monitor" + type = "string" + default = "" +} + +variable "cpu_usage_time_aggregator" { + description = "Monitor aggregator for Virtual Machine CPU [available values: min, max or avg]" + type = "string" + default = "min" +} + +variable "cpu_usage_timeframe" { + description = "Monitor timeframe for Virtual Machine CPU [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" + type = "string" + default = "last_15m" +} + +variable "cpu_usage_threshold_warning" { + description = "Virtual Machine CPU usage in percent (warning threshold)" + default = "80" +} + +variable "cpu_usage_threshold_critical" { + description = "Virtual Machine CPU usage in percent (critical threshold)" + default = "90" +} + +variable "cpu_remaining_rate_silenced" { + description = "Groups to mute for Virtual Machine CPU remaining monitor" + type = "map" + default = {} +} + +variable "cpu_remaining_rate_enabled" { + description = "Flag to enable Virtual Machine CPU remaining monitor" + type = "string" + default = "true" +} + +variable "cpu_remaining_rate_extra_tags" { + description = "Extra tags for Virtual Machine CPU remaining monitor" + type = "list" + default = [] +} + +variable "cpu_remaining_rate_message" { + description = "Custom message for Virtual Machine CPU remaining monitor" + type = "string" + default = "" +} + +variable "cpu_remaining_rate_time_aggregator" { + description = "Monitor aggregator for Virtual Machine CPU remaining [available values: min, 
max, sum or avg]" + type = "string" + default = "min" +} + +variable "cpu_remaining_rate_timeframe" { + description = "Monitor timeframe for Virtual Machine CPU remaining [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" + type = "string" + default = "last_5m" +} + +variable "cpu_remaining_rate_threshold_warning" { + description = "Virtual Machine remaining CPU credit rate in percent (warning threshold)" + default = 30 +} + +variable "cpu_remaining_rate_threshold_critical" { + description = "Virtual Machine remaining CPU credit rate in percent (critical threshold)" + default = 15 +} diff --git a/cloud/azure/virtual-machine/modules.tf b/cloud/azure/virtual-machine/modules.tf new file mode 100644 index 0000000..9967458 --- /dev/null +++ b/cloud/azure/virtual-machine/modules.tf @@ -0,0 +1,9 @@ +module "filter-tags" { + source = "../../../common/filter-tags" + + environment = "${var.environment}" + resource = "azure_virtual-machine" + filter_tags_use_defaults = "${var.filter_tags_use_defaults}" + filter_tags_custom = "${var.filter_tags_custom}" + filter_tags_custom_excluded = "${var.filter_tags_custom_excluded}" +} diff --git a/cloud/azure/virtual-machine/monitors-virtual-machine.tf b/cloud/azure/virtual-machine/monitors-virtual-machine.tf new file mode 100644 index 0000000..2ffe4d5 --- /dev/null +++ b/cloud/azure/virtual-machine/monitors-virtual-machine.tf @@ -0,0 +1,97 @@ +resource "datadog_monitor" "virtualmachine_status" { + count = "${var.status_enabled ? 
1 : 0}" + + name = "[${var.environment}] Virtual Machine is down" + message = "${coalesce(var.status_message, var.message)}" + + query = < ${var.cpu_usage_threshold_critical} + EOF + + type = "metric alert" + + thresholds { + critical = "${var.cpu_usage_threshold_critical}" + warning = "${var.cpu_usage_threshold_warning}" + } + + silenced = "${var.cpu_usage_silenced}" + + notify_no_data = false + evaluation_delay = "${var.evaluation_delay}" + renotify_interval = 0 + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = false + new_host_delay = "${var.new_host_delay}" + + tags = ["env:${var.environment}", "type:cloud", "provider:azure", "resource:virtualmachine", "team:claranet", "created-by:terraform", "${var.cpu_usage_extra_tags}"] +} + +resource "datadog_monitor" "virtualmachine_credit_cpu_remaining_too_low" { + count = "${var.cpu_remaining_rate_enabled ? 1 : 0}" + name = "[${var.environment}] Virtual Machine credit CPU too low {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" + message = "${coalesce(var.cpu_remaining_rate_message, var.message)}" + + query = <