diff --git a/cloud/aws/rds-mysql/inputs.tf b/cloud/aws/rds-mysql/inputs.tf deleted file mode 120000 index a68ace3..0000000 --- a/cloud/aws/rds-mysql/inputs.tf +++ /dev/null @@ -1 +0,0 @@ -../../../inputs.tf \ No newline at end of file diff --git a/cloud/aws/rds-mysql/monitors-rds_mysql-basics.tf b/cloud/aws/rds-mysql/monitors-rds_mysql-basics.tf deleted file mode 100644 index 87e0746..0000000 --- a/cloud/aws/rds-mysql/monitors-rds_mysql-basics.tf +++ /dev/null @@ -1,54 +0,0 @@ -resource "datadog_monitor" "rds-mysql_cpu_80_15min" { - name = "[${var.env}] rds Cpu high > 90% for 15 min on {{host.identifier}}" - message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}\n{{#is_warning}}\n${var.ho_escalation_group} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.ho_escalation_group}\n{{/is_warning_recovery}}" - - count = "${var.dd_aws_rds == "enabled" ? 1 : 0 }" - - query = "avg(last_15m):avg:aws.rds.cpuutilization{dd_monitoring:enabled,dd_aws_rds:enabled,env:${var.env}} by {region,name} > 90" - type = "query alert" - - thresholds { - warning = "${var.rds_cpu_threshold["warning"]}" - critical = "${var.rds_cpu_threshold["critical"]}" - } - - tags = ["*"] - - notify_no_data = "${var.rds_config["notify_no_data"]}" - evaluation_delay = "${var.rds_config["delay"]}" - renotify_interval = 60 - notify_audit = false - timeout_h = 0 - include_tags = true - locked = false - require_full_window = true - new_host_delay = "${var.rds_config["delay"]}" - no_data_timeframe = 20 -} - -resource "datadog_monitor" "mysql_rds_free_space_low" { - name = "[${var.env}] rds free space low < 10 % on {{host.identifier}}" - message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}\n{{#is_warning}}\n${var.ho_escalation_group} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.ho_escalation_group}\n{{/is_warning_recovery}}" - - type = "query 
alert" - query = "avg(last_15m):avg:aws.rds.free_storage_space{dd_monitoring:enabled,dd_aws_rds:enabled,env:${var.env}} by {region,name} / avg:aws.rds.total_storage_space{dd_monitoring:enabled,dd_aws_rds:enabled,env:${var.env}} by {region,name} * 100 < 10" - count = "${var.dd_aws_rds == "enabled" ? 1 : 0 }" - - thresholds { - warning = "${var.rds_mem_threshold["warning"]}" - critical = "${var.rds_mem_threshold["critical"]}" - } - - tags = ["*"] - - notify_no_data = "${var.rds_config["notify_no_data"]}" - evaluation_delay = "${var.rds_config["delay"]}" - renotify_interval = 60 - notify_audit = false - timeout_h = 0 - include_tags = true - locked = false - require_full_window = true - new_host_delay = "${var.rds_config["delay"]}" - no_data_timeframe = 20 -} diff --git a/cloud/aws/rds/README.md b/cloud/aws/rds/README.md new file mode 100644 index 0000000..36e8780 --- /dev/null +++ b/cloud/aws/rds/README.md @@ -0,0 +1,45 @@ +AWS RDS Instance DataDog monitors +================================= + +How to use this module +---------------------- + +``` +module "datadog-monitors-aws-rds" { + source = "git::ssh://git@bitbucket.org/morea/terraform.feature.datadog.git//cloud/aws/rds?ref={revision}" + + message = "${module.datadog-message-alerting.alerting-message}" + environment = "${var.environment}" +} +``` + +Purpose +------- +Creates DataDog monitors with the following checks: + +* CPU High +* Free disk space low + +Inputs +------ + +| Name | Description | Type | Default | Required | +|------|-------------|:----:|:-----:|:-----:| +| cpu_threshold_critical | CPU usage in percent (critical threshold) | string | `90` | no | +| cpu_threshold_warning | CPU usage in percent (warning threshold) | string | `80` | no | +| diskspace_threshold_critical | Disk free space in percent (critical threshold) | string | `10` | no | +| diskspace_threshold_warning | Disk free space in percent (warning threshold) | string | `20` | no | +| environment | Architecture Environment | string | - | 
yes | +| evaluation_delay | Delay in seconds for the metric evaluation | string | `600` | no | +| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no | +| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no | +| message | Message sent when an alert is triggered | string | - | yes | +| notify_no_data | Enable 'No Data' alert | string | `true` | no | +| renotify_interval | The number of minutes after the last notification before a monitor will re-notify on the current status | string | `60` | no | + +Related documentation +--------------------- + +DataDog documentation: [https://docs.datadoghq.com/integrations/amazon_rds/](https://docs.datadoghq.com/integrations/amazon_rds/) + +AWS RDS Instance metrics documentation: [https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/rds-metricscollected.html](https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/rds-metricscollected.html) diff --git a/cloud/aws/rds/inputs.tf b/cloud/aws/rds/inputs.tf new file mode 100644 index 0000000..8e6d89a --- /dev/null +++ b/cloud/aws/rds/inputs.tf @@ -0,0 +1,57 @@ +# Global Terraform +variable "environment" { + description = "Architecture Environment" + type = "string" +} + +# Global DataDog +variable "notify_no_data" { + description = "Enable 'No Data' alert" + default = true +} + +variable "evaluation_delay" { + description = "Delay in seconds for the metric evaluation" + default = 600 +} + +variable "renotify_interval" { + description = "The number of minutes after the last notification before a monitor will re-notify on the current status" + default = 60 +} + +variable "message" { + description = "Message sent when an alert is triggered" +} + +variable "filter_tags_use_defaults" { + description = "Use default filter tags convention" + default = "true" +} + +variable "filter_tags_custom" { + description = "Tags used for custom filtering when filter_tags_use_defaults is false" + 
default = "*" +} + +# AWS RDS instance specific + +variable "cpu_threshold_warning" { + description = "CPU usage in percent (warning threshold)" + default = "80" +} + +variable "cpu_threshold_critical" { + description = "CPU usage in percent (critical threshold)" + default = "90" +} + +variable "diskspace_threshold_warning" { + description = "Disk free space in percent (warning threshold)" + default = "20" +} + +variable "diskspace_threshold_critical" { + description = "Disk free space in percent (critical threshold)" + default = "10" +} diff --git a/cloud/aws/rds/monitors-rds-basics.tf b/cloud/aws/rds/monitors-rds-basics.tf new file mode 100644 index 0000000..bb83b46 --- /dev/null +++ b/cloud/aws/rds/monitors-rds-basics.tf @@ -0,0 +1,69 @@ +data "template_file" "filter" { + template = "$${filter}" + + vars { + filter = "${var.filter_tags_use_defaults == "true" ? format("dd_monitoring:enabled,dd_aws_rds:enabled,env:%s", var.environment) : "${var.filter_tags_custom}"}" + } +} + +### RDS instance CPU monitor ### +resource "datadog_monitor" "rds_cpu_90_15min" { + name = "[${var.environment}] RDS instance CPU high > ${var.cpu_threshold_critical}% for 15 min on {{host.identifier}}" + message = "${var.message}" + + type = "metric alert" + query = <<EOF + avg(last_15m): ( + avg:aws.rds.cpuutilization{${data.template_file.filter.rendered}} by {region,name} + ) > ${var.cpu_threshold_critical} +EOF + + thresholds { + warning = "${var.cpu_threshold_warning}" + critical = "${var.cpu_threshold_critical}" + } + + notify_no_data = "${var.notify_no_data}" + evaluation_delay = "${var.evaluation_delay}" + renotify_interval = "${var.renotify_interval}" + notify_audit = false + timeout_h = 0 + include_tags = true + locked = false + require_full_window = true + new_host_delay = "${var.evaluation_delay}" + no_data_timeframe = 20 + + tags = ["env:${var.environment}", "resource:rds", "team:aws", "provider:aws"] +} + +### RDS instance free space monitor ### +resource "datadog_monitor" "rds_free_space_low" { + name = "[${var.environment}] RDS instance free space < ${var.diskspace_threshold_critical}% 
on {{host.identifier}}" + message = "${var.message}" + + type = "metric alert" + query = <