MON-424 cloud/gcp/gce/instance: init (cpu utilization + disk OPS throttled)
This commit is contained in:
parent
770d87c235
commit
819dc38a07
@ -116,6 +116,8 @@ The `//` is very important, it's a terraform specific syntax used to separate gi
|
||||
- [cloud-sql](https://git.fr.clara.net/claranet/cloudnative/projects/datadog/terraform/monitors/tree/master/cloud/gcp/cloud-sql/)
|
||||
- [common](https://git.fr.clara.net/claranet/cloudnative/projects/datadog/terraform/monitors/tree/master/cloud/gcp/cloud-sql/common/)
|
||||
- [mysql](https://git.fr.clara.net/claranet/cloudnative/projects/datadog/terraform/monitors/tree/master/cloud/gcp/cloud-sql/mysql/)
|
||||
- [gce](https://git.fr.clara.net/claranet/cloudnative/projects/datadog/terraform/monitors/tree/master/cloud/gcp/gce/)
|
||||
- [instance](https://git.fr.clara.net/claranet/cloudnative/projects/datadog/terraform/monitors/tree/master/cloud/gcp/gce/instance/)
|
||||
- [lb](https://git.fr.clara.net/claranet/cloudnative/projects/datadog/terraform/monitors/tree/master/cloud/gcp/lb/)
|
||||
- [pubsub](https://git.fr.clara.net/claranet/cloudnative/projects/datadog/terraform/monitors/tree/master/cloud/gcp/pubsub/)
|
||||
- [common](https://git.fr.clara.net/claranet/cloudnative/projects/datadog/terraform/monitors/tree/master/common/)
|
||||
|
||||
70
cloud/gcp/gce/instance/README.md
Normal file
70
cloud/gcp/gce/instance/README.md
Normal file
@ -0,0 +1,70 @@
|
||||
# CLOUD GCP GCE INSTANCE Datadog monitors
|
||||
|
||||
## How to use this module
|
||||
|
||||
```
|
||||
module "datadog-monitors-cloud-gcp-gce-instance" {
|
||||
source = "git::ssh://git@git.fr.clara.net/claranet/cloudnative/projects/datadog/terraform/monitors.git//cloud/gcp/gce/instance?ref={revision}"
|
||||
|
||||
environment = "${var.environment}"
|
||||
message = "${module.datadog-message-alerting.alerting-message}"
|
||||
}
|
||||
|
||||
```
|
||||
|
||||
## Purpose
|
||||
|
||||
Creates Datadog monitors with the following checks:
|
||||
|
||||
- Compute Engine instance CPU Utilization
|
||||
- Compute Engine instance Disk Throttled Bps
|
||||
- Compute Engine instance Disk Throttled OPS
|
||||
|
||||
## Inputs
|
||||
|
||||
| Name | Description | Type | Default | Required |
|
||||
|------|-------------|:----:|:-----:|:-----:|
|
||||
| cpu\_utilization\_enabled | Flag to enable CPU Utilization monitor | string | `"true"` | no |
|
||||
| cpu\_utilization\_extra\_tags | Extra tags for CPU Utilization monitor | list | `[]` | no |
|
||||
| cpu\_utilization\_message | Custom message for the CPU Utilization monitor | string | `""` | no |
|
||||
| cpu\_utilization\_silenced | Groups to mute for CPU Utilization monitor | map | `{}` | no |
|
||||
| cpu\_utilization\_threshold\_critical | CPU Utilization in percentage (critical threshold) | string | `"90"` | no |
|
||||
| cpu\_utilization\_threshold\_warning | CPU Utilization in percentage (warning threshold) | string | `"80"` | no |
|
||||
| cpu\_utilization\_time\_aggregator | Time aggregator for the CPU Utilization monitor | string | `"avg"` | no |
|
||||
| cpu\_utilization\_timeframe | Timeframe for the CPU Utilization monitor | string | `"last_15m"` | no |
|
||||
| disk\_throttled\_bps\_enabled | Flag to enable Disk Throttled Bps monitor | string | `"true"` | no |
|
||||
| disk\_throttled\_bps\_extra\_tags | Extra tags for Disk Throttled Bps monitor | list | `[]` | no |
|
||||
| disk\_throttled\_bps\_message | Custom message for the Disk Throttled Bps monitor | string | `""` | no |
|
||||
| disk\_throttled\_bps\_notify\_no\_data | Flag to enable notification for no data on Disk Throttled Bps monitor | string | `"false"` | no |
|
||||
| disk\_throttled\_bps\_silenced | Groups to mute for Disk Throttled Bps monitor | map | `{}` | no |
|
||||
| disk\_throttled\_bps\_threshold\_critical | Disk Throttled Bps in percentage (critical threshold) | string | `"50"` | no |
|
||||
| disk\_throttled\_bps\_threshold\_warning | Disk Throttled Bps in percentage (warning threshold) | string | `"30"` | no |
|
||||
| disk\_throttled\_bps\_time\_aggregator | Time aggregator for the Disk Throttled Bps monitor | string | `"min"` | no |
|
||||
| disk\_throttled\_bps\_timeframe | Timeframe for the Disk Throttled Bps monitor | string | `"last_15m"` | no |
|
||||
| disk\_throttled\_ops\_enabled | Flag to enable Disk Throttled OPS monitor | string | `"true"` | no |
|
||||
| disk\_throttled\_ops\_extra\_tags | Extra tags for Disk Throttled OPS monitor | list | `[]` | no |
|
||||
| disk\_throttled\_ops\_message | Custom message for the Disk Throttled OPS monitor | string | `""` | no |
|
||||
| disk\_throttled\_ops\_notify\_no\_data | Flag to enable notification for no data on Disk Throttled OPS monitor | string | `"false"` | no |
|
||||
| disk\_throttled\_ops\_silenced | Groups to mute for Disk Throttled OPS monitor | map | `{}` | no |
|
||||
| disk\_throttled\_ops\_threshold\_critical | Disk Throttled OPS in percentage (critical threshold) | string | `"50"` | no |
|
||||
| disk\_throttled\_ops\_threshold\_warning | Disk Throttled OPS in percentage (warning threshold) | string | `"30"` | no |
|
||||
| disk\_throttled\_ops\_time\_aggregator | Time aggregator for the Disk Throttled OPS monitor | string | `"min"` | no |
|
||||
| disk\_throttled\_ops\_timeframe | Timeframe for the Disk Throttled OPS monitor | string | `"last_15m"` | no |
|
||||
| environment | Architecture environment | string | n/a | yes |
|
||||
| evaluation\_delay | Delay in seconds for the metric evaluation | string | `"900"` | no |
|
||||
| filter\_tags | Tags used for filtering | string | `"*"` | no |
|
||||
| message | Message sent when a monitor is triggered | string | n/a | yes |
|
||||
| new\_host\_delay | Delay in seconds for the new host evaluation | string | `"300"` | no |
|
||||
|
||||
## Outputs
|
||||
|
||||
| Name | Description |
|
||||
|------|-------------|
|
||||
| cpu\_utilization\_id | id for monitor cpu_utilization |
|
||||
| disk\_throttled\_bps\_id | id for monitor disk_throttled_bps |
|
||||
| disk\_throttled\_ops\_id | id for monitor disk_throttled_ops |
|
||||
|
||||
## Related documentation
|
||||
|
||||
* [Datadog GCE Instance metrics](https://www.datadoghq.com/blog/monitoring-google-compute-engine-performance/#instance-metrics)
|
||||
* [GCP Maximum OPS and Bps by device type and size](https://cloud.google.com/compute/docs/disks/)
|
||||
194
cloud/gcp/gce/instance/inputs.tf
Normal file
194
cloud/gcp/gce/instance/inputs.tf
Normal file
@ -0,0 +1,194 @@
|
||||
#
|
||||
# Datadog global variables
|
||||
#
|
||||
# Environment label; interpolated into every monitor name and into the "env:" tag.
variable "environment" {
  type        = "string"
  description = "Architecture environment"
}
|
||||
|
||||
# Scope expression interpolated inside the `{...}` filter of every monitor query.
# The default "*" leaves the metric scope unrestricted.
variable "filter_tags" {
  description = "Tags used for filtering"
  type        = "string"
  default     = "*"
}
|
||||
|
||||
# Module-wide notification message. Each monitor passes it as the second
# argument of coalesce(), after its own per-monitor custom message.
variable "message" {
  description = "Message sent when a monitor is triggered"
  type        = "string"
}
|
||||
|
||||
# Passed unchanged to the `evaluation_delay` argument of all three monitors.
variable "evaluation_delay" {
  description = "Delay in seconds for the metric evaluation"
  type        = "string"
  default     = 900
}
|
||||
|
||||
# Passed unchanged to the `new_host_delay` argument of all three monitors.
variable "new_host_delay" {
  description = "Delay in seconds for the new host evaluation"
  type        = "string"
  default     = 300
}
|
||||
|
||||
#
|
||||
# CPU
|
||||
#
|
||||
|
||||
# Optional override for the CPU Utilization monitor message; it is the first
# argument of coalesce(), ahead of the module-wide `message`.
variable "cpu_utilization_message" {
  type        = "string"
  default     = ""
  description = "Custom message for the CPU Utilization monitor"
}
|
||||
|
||||
# Aggregation function placed at the start of the CPU Utilization query.
variable "cpu_utilization_time_aggregator" {
  type        = "string"
  default     = "avg"
  description = "Time aggregator for the CPU Utilization monitor"
}
|
||||
|
||||
# Evaluation window interpolated inside the aggregator parentheses of the CPU query.
variable "cpu_utilization_timeframe" {
  type        = "string"
  default     = "last_15m"
  description = "Timeframe for the CPU Utilization monitor"
}
|
||||
|
||||
# Warning level, set as `thresholds.warning` on the CPU Utilization monitor.
variable "cpu_utilization_threshold_warning" {
  type        = "string"
  default     = 80
  description = "CPU Utilization in percentage (warning threshold)"
}
|
||||
|
||||
# Critical level; used both in the query comparison (`> value`) and as
# `thresholds.critical` on the CPU Utilization monitor.
variable "cpu_utilization_threshold_critical" {
  type        = "string"
  default     = 90
  description = "CPU Utilization in percentage (critical threshold)"
}
|
||||
|
||||
# Map handed to the `silenced` argument of the CPU Utilization monitor.
variable "cpu_utilization_silenced" {
  type        = "map"
  default     = {}
  description = "Groups to mute for CPU Utilization monitor"
}
|
||||
|
||||
# Switch read by the monitor's `count` expression (1 when true, 0 otherwise).
variable "cpu_utilization_enabled" {
  type        = "string"
  default     = "true"
  description = "Flag to enable CPU Utilization monitor"
}
|
||||
|
||||
# Additional tags appended after the fixed tag set of the CPU Utilization monitor.
variable "cpu_utilization_extra_tags" {
  type        = "list"
  default     = []
  description = "Extra tags for CPU Utilization monitor"
}
|
||||
|
||||
#
|
||||
# Disk Throttled Bps
|
||||
#
|
||||
|
||||
# Optional override for the Disk Throttled Bps monitor message; it is the first
# argument of coalesce(), ahead of the module-wide `message`.
variable "disk_throttled_bps_message" {
  type        = "string"
  default     = ""
  description = "Custom message for the Disk Throttled Bps monitor"
}
|
||||
|
||||
# Aggregation function placed at the start of the Disk Throttled Bps query.
variable "disk_throttled_bps_time_aggregator" {
  type        = "string"
  default     = "min"
  description = "Time aggregator for the Disk Throttled Bps monitor"
}
|
||||
|
||||
# Evaluation window interpolated inside the aggregator parentheses of the Bps query.
variable "disk_throttled_bps_timeframe" {
  type        = "string"
  default     = "last_15m"
  description = "Timeframe for the Disk Throttled Bps monitor"
}
|
||||
|
||||
# Warning level, set as `thresholds.warning` on the Disk Throttled Bps monitor.
variable "disk_throttled_bps_threshold_warning" {
  type        = "string"
  default     = 30
  description = "Disk Throttled Bps in percentage (warning threshold)"
}
|
||||
|
||||
# Critical level; used both in the query comparison (`> value`) and as
# `thresholds.critical` on the Disk Throttled Bps monitor.
variable "disk_throttled_bps_threshold_critical" {
  type        = "string"
  default     = 50
  description = "Disk Throttled Bps in percentage (critical threshold)"
}
|
||||
|
||||
# Map handed to the `silenced` argument of the Disk Throttled Bps monitor.
variable "disk_throttled_bps_silenced" {
  type        = "map"
  default     = {}
  description = "Groups to mute for Disk Throttled Bps monitor"
}
|
||||
|
||||
# Switch read by the monitor's `count` expression (1 when true, 0 otherwise).
variable "disk_throttled_bps_enabled" {
  type        = "string"
  default     = "true"
  description = "Flag to enable Disk Throttled Bps monitor"
}
|
||||
|
||||
# Additional tags appended after the fixed tag set of the Disk Throttled Bps monitor.
variable "disk_throttled_bps_extra_tags" {
  type        = "list"
  default     = []
  description = "Extra tags for Disk Throttled Bps monitor"
}
|
||||
|
||||
# Forwarded to the `notify_no_data` argument of the Disk Throttled Bps monitor.
variable "disk_throttled_bps_notify_no_data" {
  type        = "string"
  default     = "false"
  description = "Flag to enable notification for no data on Disk Throttled Bps monitor"
}
|
||||
|
||||
#
|
||||
# Disk Throttled OPS
|
||||
#
|
||||
|
||||
# Optional override for the Disk Throttled OPS monitor message; it is the first
# argument of coalesce(), ahead of the module-wide `message`.
variable "disk_throttled_ops_message" {
  type        = "string"
  default     = ""
  description = "Custom message for the Disk Throttled OPS monitor"
}
|
||||
|
||||
# Aggregation function placed at the start of the Disk Throttled OPS query.
variable "disk_throttled_ops_time_aggregator" {
  type        = "string"
  default     = "min"
  description = "Time aggregator for the Disk Throttled OPS monitor"
}
|
||||
|
||||
# Evaluation window interpolated inside the aggregator parentheses of the OPS query.
variable "disk_throttled_ops_timeframe" {
  type        = "string"
  default     = "last_15m"
  description = "Timeframe for the Disk Throttled OPS monitor"
}
|
||||
|
||||
# Warning level, set as `thresholds.warning` on the Disk Throttled OPS monitor.
variable "disk_throttled_ops_threshold_warning" {
  type        = "string"
  default     = 30
  description = "Disk Throttled OPS in percentage (warning threshold)"
}
|
||||
|
||||
# Critical level; used both in the query comparison (`> value`) and as
# `thresholds.critical` on the Disk Throttled OPS monitor.
variable "disk_throttled_ops_threshold_critical" {
  type        = "string"
  default     = 50
  description = "Disk Throttled OPS in percentage (critical threshold)"
}
|
||||
|
||||
# Map handed to the `silenced` argument of the Disk Throttled OPS monitor.
variable "disk_throttled_ops_silenced" {
  type        = "map"
  default     = {}
  description = "Groups to mute for Disk Throttled OPS monitor"
}
|
||||
|
||||
# Switch read by the monitor's `count` expression (1 when true, 0 otherwise).
variable "disk_throttled_ops_enabled" {
  type        = "string"
  default     = "true"
  description = "Flag to enable Disk Throttled OPS monitor"
}
|
||||
|
||||
# Additional tags appended after the fixed tag set of the Disk Throttled OPS monitor.
variable "disk_throttled_ops_extra_tags" {
  type        = "list"
  default     = []
  description = "Extra tags for Disk Throttled OPS monitor"
}
|
||||
|
||||
# Forwarded to the `notify_no_data` argument of the Disk Throttled OPS monitor.
variable "disk_throttled_ops_notify_no_data" {
  type        = "string"
  default     = "false"
  description = "Flag to enable notification for no data on Disk Throttled OPS monitor"
}
|
||||
122
cloud/gcp/gce/instance/monitors-gce-instance.tf
Normal file
122
cloud/gcp/gce/instance/monitors-gce-instance.tf
Normal file
@ -0,0 +1,122 @@
|
||||
#
|
||||
# CPU Utilization
|
||||
#
|
||||
# Metric alert on GCE instance CPU utilization, grouped by instance_name.
resource "datadog_monitor" "cpu_utilization" {
  # One monitor when the enable flag is true, none otherwise.
  count = "${var.cpu_utilization_enabled ? 1 : 0}"

  type    = "metric alert"
  name    = "[${var.environment}] Compute Engine instance CPU Utilization {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
  message = "${coalesce(var.cpu_utilization_message, var.message)}"

  # The metric is multiplied by 100 before being compared with the
  # percentage-based critical threshold.
  query = <<EOF
${var.cpu_utilization_time_aggregator}(${var.cpu_utilization_timeframe}):
avg:gcp.gce.instance.cpu.utilization{${var.filter_tags}} by {instance_name} * 100
> ${var.cpu_utilization_threshold_critical}
EOF

  thresholds {
    critical = "${var.cpu_utilization_threshold_critical}"
    warning  = "${var.cpu_utilization_threshold_warning}"
  }

  # Evaluation timing, shared with the other monitors of this module.
  evaluation_delay = "${var.evaluation_delay}"
  new_host_delay   = "${var.new_host_delay}"

  # Notification behaviour; no-data notification is fixed to true for this monitor.
  notify_no_data      = true
  require_full_window = false
  renotify_interval   = 0
  notify_audit        = false
  timeout_h           = 0
  include_tags        = true
  locked              = false

  silenced = "${var.cpu_utilization_silenced}"

  tags = [
    "env:${var.environment}",
    "type:cloud",
    "provider:gcp",
    "resource:gce-instance",
    "team:claranet",
    "created-by:terraform",
    "${var.cpu_utilization_extra_tags}",
  ]
}
|
||||
|
||||
#
|
||||
# Disk Throttled Bps
|
||||
#
|
||||
# Metric alert on the share of throttled disk bytes, grouped by
# instance_name and device_name.
resource "datadog_monitor" "disk_throttled_bps" {
  # One monitor when the enable flag is true, none otherwise.
  count = "${var.disk_throttled_bps_enabled ? 1 : 0}"

  type    = "metric alert"
  name    = "[${var.environment}] Compute Engine instance Disk Throttled Bps {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
  message = "${coalesce(var.disk_throttled_bps_message, var.message)}"

  # (throttled read + throttled write bytes) / (read + write bytes) * 100,
  # compared with the critical threshold.
  query = <<EOF
${var.disk_throttled_bps_time_aggregator}(${var.disk_throttled_bps_timeframe}):
(
sum:gcp.gce.instance.disk.throttled_read_bytes_count{${var.filter_tags}} by {instance_name, device_name} +
sum:gcp.gce.instance.disk.throttled_write_bytes_count{${var.filter_tags}} by {instance_name, device_name}
) / (
sum:gcp.gce.instance.disk.read_bytes_count{${var.filter_tags}} by {instance_name, device_name} +
sum:gcp.gce.instance.disk.write_bytes_count{${var.filter_tags}} by {instance_name, device_name}
) * 100
> ${var.disk_throttled_bps_threshold_critical}
EOF

  thresholds {
    critical = "${var.disk_throttled_bps_threshold_critical}"
    warning  = "${var.disk_throttled_bps_threshold_warning}"
  }

  # Evaluation timing, shared with the other monitors of this module.
  evaluation_delay = "${var.evaluation_delay}"
  new_host_delay   = "${var.new_host_delay}"

  # Notification behaviour; no-data notification is driven by a variable.
  notify_no_data      = "${var.disk_throttled_bps_notify_no_data}"
  require_full_window = false
  renotify_interval   = 0
  notify_audit        = false
  timeout_h           = 0
  include_tags        = true
  locked              = false

  silenced = "${var.disk_throttled_bps_silenced}"

  tags = [
    "env:${var.environment}",
    "type:cloud",
    "provider:gcp",
    "resource:gce-instance",
    "team:claranet",
    "created-by:terraform",
    "${var.disk_throttled_bps_extra_tags}",
  ]
}
|
||||
|
||||
#
|
||||
# Disk Throttled OPS
|
||||
#
|
||||
# Metric alert on the share of throttled disk operations, grouped by
# instance_name and device_name.
resource "datadog_monitor" "disk_throttled_ops" {
  # One monitor when the enable flag is true, none otherwise.
  count = "${var.disk_throttled_ops_enabled ? 1 : 0}"

  type    = "metric alert"
  name    = "[${var.environment}] Compute Engine instance Disk Throttled OPS {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
  message = "${coalesce(var.disk_throttled_ops_message, var.message)}"

  # (throttled read + throttled write ops) / (read + write ops) * 100,
  # compared with the critical threshold.
  query = <<EOF
${var.disk_throttled_ops_time_aggregator}(${var.disk_throttled_ops_timeframe}):
(
sum:gcp.gce.instance.disk.throttled_read_ops_count{${var.filter_tags}} by {instance_name, device_name} +
sum:gcp.gce.instance.disk.throttled_write_ops_count{${var.filter_tags}} by {instance_name, device_name}
) / (
sum:gcp.gce.instance.disk.read_ops_count{${var.filter_tags}} by {instance_name, device_name} +
sum:gcp.gce.instance.disk.write_ops_count{${var.filter_tags}} by {instance_name, device_name}
) * 100
> ${var.disk_throttled_ops_threshold_critical}
EOF

  thresholds {
    critical = "${var.disk_throttled_ops_threshold_critical}"
    warning  = "${var.disk_throttled_ops_threshold_warning}"
  }

  # Evaluation timing, shared with the other monitors of this module.
  evaluation_delay = "${var.evaluation_delay}"
  new_host_delay   = "${var.new_host_delay}"

  # Notification behaviour; no-data notification is driven by a variable.
  notify_no_data      = "${var.disk_throttled_ops_notify_no_data}"
  require_full_window = false
  renotify_interval   = 0
  notify_audit        = false
  timeout_h           = 0
  include_tags        = true
  locked              = false

  silenced = "${var.disk_throttled_ops_silenced}"

  tags = [
    "env:${var.environment}",
    "type:cloud",
    "provider:gcp",
    "resource:gce-instance",
    "team:claranet",
    "created-by:terraform",
    "${var.disk_throttled_ops_extra_tags}",
  ]
}
|
||||
14
cloud/gcp/gce/instance/outputs.tf
Normal file
14
cloud/gcp/gce/instance/outputs.tf
Normal file
@ -0,0 +1,14 @@
|
||||
# Splat over the counted resource, so the value is a list of ids.
output "cpu_utilization_id" {
  value       = "${datadog_monitor.cpu_utilization.*.id}"
  description = "id for monitor cpu_utilization"
}
|
||||
|
||||
# Splat over the counted resource, so the value is a list of ids.
output "disk_throttled_bps_id" {
  value       = "${datadog_monitor.disk_throttled_bps.*.id}"
  description = "id for monitor disk_throttled_bps"
}
|
||||
|
||||
# Splat over the counted resource, so the value is a list of ids.
output "disk_throttled_ops_id" {
  value       = "${datadog_monitor.disk_throttled_ops.*.id}"
  description = "id for monitor disk_throttled_ops"
}
|
||||
Loading…
x
Reference in New Issue
Block a user