MON-424 cloud/gcp/gce/instance: init (cpu utilization + disk OPS throttled)

This commit is contained in:
Maxime de Roucy 2019-03-28 16:22:59 +01:00 committed by Quentin Manfroi
parent 770d87c235
commit 819dc38a07
5 changed files with 402 additions and 0 deletions

View File

@ -116,6 +116,8 @@ The `//` is very important, it's a terraform specific syntax used to separate gi
- [cloud-sql](https://git.fr.clara.net/claranet/cloudnative/projects/datadog/terraform/monitors/tree/master/cloud/gcp/cloud-sql/)
- [common](https://git.fr.clara.net/claranet/cloudnative/projects/datadog/terraform/monitors/tree/master/cloud/gcp/cloud-sql/common/)
- [mysql](https://git.fr.clara.net/claranet/cloudnative/projects/datadog/terraform/monitors/tree/master/cloud/gcp/cloud-sql/mysql/)
- [gce](https://git.fr.clara.net/claranet/cloudnative/projects/datadog/terraform/monitors/tree/master/cloud/gcp/gce/)
- [instance](https://git.fr.clara.net/claranet/cloudnative/projects/datadog/terraform/monitors/tree/master/cloud/gcp/gce/instance/)
- [lb](https://git.fr.clara.net/claranet/cloudnative/projects/datadog/terraform/monitors/tree/master/cloud/gcp/lb/)
- [pubsub](https://git.fr.clara.net/claranet/cloudnative/projects/datadog/terraform/monitors/tree/master/cloud/gcp/pubsub/)
- [common](https://git.fr.clara.net/claranet/cloudnative/projects/datadog/terraform/monitors/tree/master/common/)

View File

@ -0,0 +1,70 @@
# CLOUD GCP GCE INSTANCE Datadog monitors
## How to use this module
```
module "datadog-monitors-cloud-gcp-gce-instance" {
source = "git::ssh://git@git.fr.clara.net/claranet/cloudnative/projects/datadog/terraform/monitors.git//cloud/gcp/gce/instance?ref={revision}"
environment = "${var.environment}"
message = "${module.datadog-message-alerting.alerting-message}"
}
```
## Purpose
Creates Datadog monitors with the following checks:
- Compute Engine instance CPU Utilization
- Compute Engine instance Disk Throttled Bps
- Compute Engine instance Disk Throttled OPS
## Inputs
| Name | Description | Type | Default | Required |
|------|-------------|:----:|:-----:|:-----:|
| cpu\_utilization\_enabled | Flag to enable CPU Utilization monitor | string | `"true"` | no |
| cpu\_utilization\_extra\_tags | Extra tags for CPU Utilization monitor | list | `[]` | no |
| cpu\_utilization\_message | Custom message for the CPU Utilization monitor | string | `""` | no |
| cpu\_utilization\_silenced | Groups to mute for CPU Utilization monitor | map | `{}` | no |
| cpu\_utilization\_threshold\_critical | CPU Utilization in percentage (critical threshold) | string | `"90"` | no |
| cpu\_utilization\_threshold\_warning | CPU Utilization in percentage (warning threshold) | string | `"80"` | no |
| cpu\_utilization\_time\_aggregator | Time aggregator for the CPU Utilization monitor | string | `"avg"` | no |
| cpu\_utilization\_timeframe | Timeframe for the CPU Utilization monitor | string | `"last_15m"` | no |
| disk\_throttled\_bps\_enabled | Flag to enable Disk Throttled Bps monitor | string | `"true"` | no |
| disk\_throttled\_bps\_extra\_tags | Extra tags for Disk Throttled Bps monitor | list | `[]` | no |
| disk\_throttled\_bps\_message | Custom message for the Disk Throttled Bps monitor | string | `""` | no |
| disk\_throttled\_bps\_notify\_no\_data | Flag to enable notification for no data on Disk Throttled Bps monitor | string | `"false"` | no |
| disk\_throttled\_bps\_silenced | Groups to mute for Disk Throttled Bps monitor | map | `{}` | no |
| disk\_throttled\_bps\_threshold\_critical | Disk Throttled Bps in percentage (critical threshold) | string | `"50"` | no |
| disk\_throttled\_bps\_threshold\_warning | Disk Throttled Bps in percentage (warning threshold) | string | `"30"` | no |
| disk\_throttled\_bps\_time\_aggregator | Time aggregator for the Disk Throttled Bps monitor | string | `"min"` | no |
| disk\_throttled\_bps\_timeframe | Timeframe for the Disk Throttled Bps monitor | string | `"last_15m"` | no |
| disk\_throttled\_ops\_enabled | Flag to enable Disk Throttled OPS monitor | string | `"true"` | no |
| disk\_throttled\_ops\_extra\_tags | Extra tags for Disk Throttled OPS monitor | list | `[]` | no |
| disk\_throttled\_ops\_message | Custom message for the Disk Throttled OPS monitor | string | `""` | no |
| disk\_throttled\_ops\_notify\_no\_data | Flag to enable notification for no data on Disk Throttled OPS monitor | string | `"false"` | no |
| disk\_throttled\_ops\_silenced | Groups to mute for Disk Throttled OPS monitor | map | `{}` | no |
| disk\_throttled\_ops\_threshold\_critical | Disk Throttled OPS in percentage (critical threshold) | string | `"50"` | no |
| disk\_throttled\_ops\_threshold\_warning | Disk Throttled OPS in percentage (warning threshold) | string | `"30"` | no |
| disk\_throttled\_ops\_time\_aggregator | Time aggregator for the Disk Throttled OPS monitor | string | `"min"` | no |
| disk\_throttled\_ops\_timeframe | Timeframe for the Disk Throttled OPS monitor | string | `"last_15m"` | no |
| environment | Architecture environment | string | n/a | yes |
| evaluation\_delay | Delay in seconds for the metric evaluation | string | `"900"` | no |
| filter\_tags | Tags used for filtering | string | `"*"` | no |
| message | Message sent when a monitor is triggered | string | n/a | yes |
| new\_host\_delay | Delay in seconds for the new host evaluation | string | `"300"` | no |
## Outputs
| Name | Description |
|------|-------------|
| cpu\_utilization\_id | id for monitor cpu_utilization |
| disk\_throttled\_bps\_id | id for monitor disk_throttled_bps |
| disk\_throttled\_ops\_id | id for monitor disk_throttled_ops |
## Related documentation
* [Datadog GCE Instance metrics](https://www.datadoghq.com/blog/monitoring-google-compute-engine-performance/#instance-metrics)
* [GCP Maximum OPS and Bps by device type and size](https://cloud.google.com/compute/docs/disks/)

View File

@ -0,0 +1,194 @@
#
# Datadog global variables
#
# Environment name. Required (no default); interpolated into every monitor
# name and into the "env:" tag.
variable "environment" {
  description = "Architecture environment"
  type        = "string"
}

# Scope expression interpolated between the braces of each metric query.
# The default "*" applies no filtering.
variable "filter_tags" {
  description = "Tags used for filtering"
  type        = "string"
  default     = "*"
}

# Shared notification message. Required (no default); each monitor uses it
# when its own *_message variable is left empty.
variable "message" {
  description = "Message sent when a monitor is triggered"
  type        = "string"
}

# Passed unchanged to the evaluation_delay argument of every monitor.
variable "evaluation_delay" {
  description = "Delay in seconds for the metric evaluation"
  type        = "string"
  default     = 900
}

# Passed unchanged to the new_host_delay argument of every monitor.
variable "new_host_delay" {
  description = "Delay in seconds for the new host evaluation"
  type        = "string"
  default     = 300
}
#
# CPU
#
# Optional message override for the CPU Utilization monitor; empty by default.
variable "cpu_utilization_message" {
  type        = "string"
  default     = ""
  description = "Custom message for the CPU Utilization monitor"
}

# Time aggregation function placed at the start of the monitor query.
variable "cpu_utilization_time_aggregator" {
  type        = "string"
  default     = "avg"
  description = "Time aggregator for the CPU Utilization monitor"
}

# Evaluation window placed in the parentheses of the monitor query.
variable "cpu_utilization_timeframe" {
  type        = "string"
  default     = "last_15m"
  description = "Timeframe for the CPU Utilization monitor"
}

# Warning level, used in the monitor's thresholds block.
variable "cpu_utilization_threshold_warning" {
  type        = "string"
  default     = 80
  description = "CPU Utilization in percentage (warning threshold)"
}

# Critical level, used both in the query comparison and the thresholds block.
variable "cpu_utilization_threshold_critical" {
  type        = "string"
  default     = 90
  description = "CPU Utilization in percentage (critical threshold)"
}

# Map handed to the monitor's "silenced" argument; empty map by default.
variable "cpu_utilization_silenced" {
  type        = "map"
  default     = {}
  description = "Groups to mute for CPU Utilization monitor"
}

# Drives the monitor's count: 1 when true, 0 otherwise.
variable "cpu_utilization_enabled" {
  type        = "string"
  default     = "true"
  description = "Flag to enable CPU Utilization monitor"
}

# Additional tags appended after the module's fixed tag set.
variable "cpu_utilization_extra_tags" {
  type        = "list"
  default     = []
  description = "Extra tags for CPU Utilization monitor"
}
#
# Disk Throttled Bps
#
# Optional message override for the Disk Throttled Bps monitor; empty by default.
variable "disk_throttled_bps_message" {
  type        = "string"
  default     = ""
  description = "Custom message for the Disk Throttled Bps monitor"
}

# Time aggregation function placed at the start of the monitor query.
variable "disk_throttled_bps_time_aggregator" {
  type        = "string"
  default     = "min"
  description = "Time aggregator for the Disk Throttled Bps monitor"
}

# Evaluation window placed in the parentheses of the monitor query.
variable "disk_throttled_bps_timeframe" {
  type        = "string"
  default     = "last_15m"
  description = "Timeframe for the Disk Throttled Bps monitor"
}

# Warning level, used in the monitor's thresholds block.
variable "disk_throttled_bps_threshold_warning" {
  type        = "string"
  default     = 30
  description = "Disk Throttled Bps in percentage (warning threshold)"
}

# Critical level, used both in the query comparison and the thresholds block.
variable "disk_throttled_bps_threshold_critical" {
  type        = "string"
  default     = 50
  description = "Disk Throttled Bps in percentage (critical threshold)"
}

# Map handed to the monitor's "silenced" argument; empty map by default.
variable "disk_throttled_bps_silenced" {
  type        = "map"
  default     = {}
  description = "Groups to mute for Disk Throttled Bps monitor"
}

# Drives the monitor's count: 1 when true, 0 otherwise.
variable "disk_throttled_bps_enabled" {
  type        = "string"
  default     = "true"
  description = "Flag to enable Disk Throttled Bps monitor"
}

# Additional tags appended after the module's fixed tag set.
variable "disk_throttled_bps_extra_tags" {
  type        = "list"
  default     = []
  description = "Extra tags for Disk Throttled Bps monitor"
}

# Forwarded to the monitor's notify_no_data argument; off by default.
variable "disk_throttled_bps_notify_no_data" {
  type        = "string"
  default     = "false"
  description = "Flag to enable notification for no data on Disk Throttled Bps monitor"
}
#
# Disk Throttled OPS
#
# Optional message override for the Disk Throttled OPS monitor; empty by default.
variable "disk_throttled_ops_message" {
  type        = "string"
  default     = ""
  description = "Custom message for the Disk Throttled OPS monitor"
}

# Time aggregation function placed at the start of the monitor query.
variable "disk_throttled_ops_time_aggregator" {
  type        = "string"
  default     = "min"
  description = "Time aggregator for the Disk Throttled OPS monitor"
}

# Evaluation window placed in the parentheses of the monitor query.
variable "disk_throttled_ops_timeframe" {
  type        = "string"
  default     = "last_15m"
  description = "Timeframe for the Disk Throttled OPS monitor"
}

# Warning level, used in the monitor's thresholds block.
variable "disk_throttled_ops_threshold_warning" {
  type        = "string"
  default     = 30
  description = "Disk Throttled OPS in percentage (warning threshold)"
}

# Critical level, used both in the query comparison and the thresholds block.
variable "disk_throttled_ops_threshold_critical" {
  type        = "string"
  default     = 50
  description = "Disk Throttled OPS in percentage (critical threshold)"
}

# Map handed to the monitor's "silenced" argument; empty map by default.
variable "disk_throttled_ops_silenced" {
  type        = "map"
  default     = {}
  description = "Groups to mute for Disk Throttled OPS monitor"
}

# Drives the monitor's count: 1 when true, 0 otherwise.
variable "disk_throttled_ops_enabled" {
  type        = "string"
  default     = "true"
  description = "Flag to enable Disk Throttled OPS monitor"
}

# Additional tags appended after the module's fixed tag set.
variable "disk_throttled_ops_extra_tags" {
  type        = "list"
  default     = []
  description = "Extra tags for Disk Throttled OPS monitor"
}

# Forwarded to the monitor's notify_no_data argument; off by default.
variable "disk_throttled_ops_notify_no_data" {
  type        = "string"
  default     = "false"
  description = "Flag to enable notification for no data on Disk Throttled OPS monitor"
}

View File

@ -0,0 +1,122 @@
#
# CPU Utilization
#
# Metric alert on gcp.gce.instance.cpu.utilization, grouped by instance_name.
# The metric is multiplied by 100 before being compared with the critical
# threshold, so thresholds are written as percentages.
resource "datadog_monitor" "cpu_utilization" {
  # Feature toggle: one monitor when the flag is true, none otherwise.
  count = "${var.cpu_utilization_enabled ? 1 : 0}"

  type = "metric alert"
  name = "[${var.environment}] Compute Engine instance CPU Utilization {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"

  # The monitor-specific message wins; the shared message is used when it is empty.
  message = "${coalesce(var.cpu_utilization_message, var.message)}"

  # Heredoc body stays flush-left: a plain <<EOF heredoc keeps leading whitespace.
  query = <<EOF
${var.cpu_utilization_time_aggregator}(${var.cpu_utilization_timeframe}):
avg:gcp.gce.instance.cpu.utilization{${var.filter_tags}} by {instance_name} * 100
> ${var.cpu_utilization_threshold_critical}
EOF

  thresholds {
    critical = "${var.cpu_utilization_threshold_critical}"
    warning  = "${var.cpu_utilization_threshold_warning}"
  }

  # Evaluation timing
  evaluation_delay    = "${var.evaluation_delay}"
  new_host_delay      = "${var.new_host_delay}"
  require_full_window = false

  # Notification behaviour (no-data notification is always on for this monitor)
  notify_no_data    = true
  notify_audit      = false
  renotify_interval = 0
  timeout_h         = 0
  include_tags      = true
  locked            = false

  silenced = "${var.cpu_utilization_silenced}"

  # Fixed module tags first, caller-supplied extra tags last.
  tags = [
    "env:${var.environment}",
    "type:cloud",
    "provider:gcp",
    "resource:gce-instance",
    "team:claranet",
    "created-by:terraform",
    "${var.cpu_utilization_extra_tags}"
  ]
}
#
# Disk Throttled Bps
#
# Metric alert on throttled disk bytes, grouped by instance_name and device_name.
# The query divides (throttled read + throttled write bytes) by
# (read + write bytes), multiplies by 100, and compares the result with the
# critical threshold.
resource "datadog_monitor" "disk_throttled_bps" {
  # Feature toggle: one monitor when the flag is true, none otherwise.
  count = "${var.disk_throttled_bps_enabled ? 1 : 0}"

  type = "metric alert"
  name = "[${var.environment}] Compute Engine instance Disk Throttled Bps {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"

  # The monitor-specific message wins; the shared message is used when it is empty.
  message = "${coalesce(var.disk_throttled_bps_message, var.message)}"

  # Heredoc body stays flush-left: a plain <<EOF heredoc keeps leading whitespace.
  query = <<EOF
${var.disk_throttled_bps_time_aggregator}(${var.disk_throttled_bps_timeframe}):
(
sum:gcp.gce.instance.disk.throttled_read_bytes_count{${var.filter_tags}} by {instance_name, device_name} +
sum:gcp.gce.instance.disk.throttled_write_bytes_count{${var.filter_tags}} by {instance_name, device_name}
) / (
sum:gcp.gce.instance.disk.read_bytes_count{${var.filter_tags}} by {instance_name, device_name} +
sum:gcp.gce.instance.disk.write_bytes_count{${var.filter_tags}} by {instance_name, device_name}
) * 100
> ${var.disk_throttled_bps_threshold_critical}
EOF

  thresholds {
    critical = "${var.disk_throttled_bps_threshold_critical}"
    warning  = "${var.disk_throttled_bps_threshold_warning}"
  }

  # Evaluation timing
  evaluation_delay    = "${var.evaluation_delay}"
  new_host_delay      = "${var.new_host_delay}"
  require_full_window = false

  # Notification behaviour (no-data notification is caller-configurable)
  notify_no_data    = "${var.disk_throttled_bps_notify_no_data}"
  notify_audit      = false
  renotify_interval = 0
  timeout_h         = 0
  include_tags      = true
  locked            = false

  silenced = "${var.disk_throttled_bps_silenced}"

  # Fixed module tags first, caller-supplied extra tags last.
  tags = [
    "env:${var.environment}",
    "type:cloud",
    "provider:gcp",
    "resource:gce-instance",
    "team:claranet",
    "created-by:terraform",
    "${var.disk_throttled_bps_extra_tags}"
  ]
}
#
# Disk Throttled OPS
#
# Metric alert on throttled disk operations, grouped by instance_name and
# device_name. The query divides (throttled read + throttled write ops) by
# (read + write ops), multiplies by 100, and compares the result with the
# critical threshold.
resource "datadog_monitor" "disk_throttled_ops" {
  # Feature toggle: one monitor when the flag is true, none otherwise.
  count = "${var.disk_throttled_ops_enabled ? 1 : 0}"

  type = "metric alert"
  name = "[${var.environment}] Compute Engine instance Disk Throttled OPS {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"

  # The monitor-specific message wins; the shared message is used when it is empty.
  message = "${coalesce(var.disk_throttled_ops_message, var.message)}"

  # Heredoc body stays flush-left: a plain <<EOF heredoc keeps leading whitespace.
  query = <<EOF
${var.disk_throttled_ops_time_aggregator}(${var.disk_throttled_ops_timeframe}):
(
sum:gcp.gce.instance.disk.throttled_read_ops_count{${var.filter_tags}} by {instance_name, device_name} +
sum:gcp.gce.instance.disk.throttled_write_ops_count{${var.filter_tags}} by {instance_name, device_name}
) / (
sum:gcp.gce.instance.disk.read_ops_count{${var.filter_tags}} by {instance_name, device_name} +
sum:gcp.gce.instance.disk.write_ops_count{${var.filter_tags}} by {instance_name, device_name}
) * 100
> ${var.disk_throttled_ops_threshold_critical}
EOF

  thresholds {
    critical = "${var.disk_throttled_ops_threshold_critical}"
    warning  = "${var.disk_throttled_ops_threshold_warning}"
  }

  # Evaluation timing
  evaluation_delay    = "${var.evaluation_delay}"
  new_host_delay      = "${var.new_host_delay}"
  require_full_window = false

  # Notification behaviour (no-data notification is caller-configurable)
  notify_no_data    = "${var.disk_throttled_ops_notify_no_data}"
  notify_audit      = false
  renotify_interval = 0
  timeout_h         = 0
  include_tags      = true
  locked            = false

  silenced = "${var.disk_throttled_ops_silenced}"

  # Fixed module tags first, caller-supplied extra tags last.
  tags = [
    "env:${var.environment}",
    "type:cloud",
    "provider:gcp",
    "resource:gce-instance",
    "team:claranet",
    "created-by:terraform",
    "${var.disk_throttled_ops_extra_tags}"
  ]
}

View File

@ -0,0 +1,14 @@
# Exposes the id attribute of the cpu_utilization monitor via a splat
# expression over the resource's instances.
output "cpu_utilization_id" {
  value       = "${datadog_monitor.cpu_utilization.*.id}"
  description = "id for monitor cpu_utilization"
}
# Exposes the id attribute of the disk_throttled_bps monitor via a splat
# expression over the resource's instances.
output "disk_throttled_bps_id" {
  value       = "${datadog_monitor.disk_throttled_bps.*.id}"
  description = "id for monitor disk_throttled_bps"
}
# Exposes the id attribute of the disk_throttled_ops monitor via a splat
# expression over the resource's instances.
output "disk_throttled_ops_id" {
  value       = "${datadog_monitor.disk_throttled_ops.*.id}"
  description = "id for monitor disk_throttled_ops"
}