Merge branch 'MON-400-gpc-pubsub-p2' into 'master'

MON-400 monitors for pubsub topics and subscriptions

See merge request claranet/pt-monitoring/projects/datadog/terraform/monitors!100
This commit is contained in:
Quentin Manfroi 2019-08-29 18:07:35 +02:00
commit 712a5cf226
10 changed files with 599 additions and 80 deletions

View File

@ -184,6 +184,8 @@ The `//` is very important, it's a terraform specific syntax used to separate gi
- [instance](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/gcp/gce/instance/)
- [lb](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/gcp/lb/)
- [pubsub](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/gcp/pubsub/)
- [subscription](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/gcp/pubsub/subscription/)
- [topic](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/gcp/pubsub/topic/)
- [common](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/common/)
- [alerting-message](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/common/alerting-message/)
- [filter-tags](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/common/filter-tags/)

View File

@ -1,73 +0,0 @@
#
# Sending Operations Count
#
# Monitor on the number of Pub/Sub send-message operations, evaluated per topic_id.
# - Created only when var.sending_operations_count_enabled is the string "true".
# - Triggers when the aggregated count is <= the critical threshold; missing
#   points are replaced by 0 through default(..., 0).
# - Also notifies when no data is received (notify_no_data = true).
resource "datadog_monitor" "sending_operations_count" {
  count   = var.sending_operations_count_enabled == "true" ? 1 : 0
  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] GCP pubsub sending messages operations {{#is_alert}}{{{comparator}}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ({{value}}){{/is_warning}}"
  # Falls back to the module-wide message when no monitor-specific one is given.
  message = coalesce(var.sending_operations_count_message, var.message)
  type    = "query alert"

  # Heredoc body is kept flush-left so the query string sent to Datadog is unchanged.
  query = <<EOQ
${var.sending_operations_count_time_aggregator}(${var.sending_operations_count_timeframe}):
default(avg:gcp.pubsub.topic.send_message_operation_count{${var.filter_tags}} by {topic_id}.as_count(), 0)
<= ${var.sending_operations_count_threshold_critical}
EOQ

  thresholds = {
    critical = var.sending_operations_count_threshold_critical
  }

  evaluation_delay    = var.evaluation_delay
  new_host_delay      = var.new_host_delay
  notify_audit        = false
  locked              = false
  timeout_h           = 0
  include_tags        = true
  require_full_window = false
  notify_no_data      = true
  renotify_interval   = 0

  tags = concat(["env:${var.environment}", "type:cloud", "provider:gcp", "resource:pubsub", "team:claranet", "created-by:terraform"], var.sending_operations_count_extra_tags)

  lifecycle {
    # Bare attribute reference: quoted references in ignore_changes are
    # deprecated since Terraform 0.12 and emit a warning.
    ignore_changes = [silenced]
  }
}
#
# Unavailable Sending Operations Count
#
# Monitor on the number of Pub/Sub send-message operations whose
# response_code tag is "unavailable", evaluated per topic_id.
# - Created only when var.unavailable_sending_operations_count_enabled is "true".
# - Triggers when the aggregated count is >= the critical threshold; missing
#   points are replaced by 0 through default(..., 0).
resource "datadog_monitor" "unavailable_sending_operations_count" {
  count   = var.unavailable_sending_operations_count_enabled == "true" ? 1 : 0
  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] GCP pubsub sending messages with result unavailable {{#is_alert}}{{{comparator}}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ({{value}}){{/is_warning}}"
  # Falls back to the module-wide message when no monitor-specific one is given.
  message = coalesce(var.unavailable_sending_operations_count_message, var.message)
  type    = "query alert"

  # Heredoc body is kept flush-left so the query string sent to Datadog is unchanged.
  query = <<EOQ
${var.unavailable_sending_operations_count_time_aggregator}(${var.unavailable_sending_operations_count_timeframe}):
default(avg:gcp.pubsub.topic.send_message_operation_count{${var.filter_tags},response_code:unavailable} by {topic_id}.as_count(), 0)
>= ${var.unavailable_sending_operations_count_threshold_critical}
EOQ

  thresholds = {
    warning  = var.unavailable_sending_operations_count_threshold_warning
    critical = var.unavailable_sending_operations_count_threshold_critical
  }

  evaluation_delay    = var.evaluation_delay
  new_host_delay      = var.new_host_delay
  notify_audit        = false
  locked              = false
  timeout_h           = 0
  include_tags        = true
  require_full_window = false
  notify_no_data      = false
  renotify_interval   = 0

  tags = concat(["env:${var.environment}", "type:cloud", "provider:gcp", "resource:pubsub", "team:claranet", "created-by:terraform"], var.unavailable_sending_operations_count_extra_tags)

  lifecycle {
    # Bare attribute reference: quoted references in ignore_changes are
    # deprecated since Terraform 0.12 and emit a warning.
    ignore_changes = [silenced]
  }
}

View File

@ -0,0 +1,72 @@
# CLOUD GCP PUBSUB SUBSCRIPTION DataDog monitors
## How to use this module
```
module "datadog-monitors-cloud-gcp-pubsub-subscription" {
source = "git::ssh://git@git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors.git//cloud/gcp/pubsub/subscription?ref={revision}"
environment = var.environment
message = module.datadog-message-alerting.alerting-message
}
```
## Purpose
Creates DataDog monitors with the following checks:
- Pub/Sub Subscription latency on push endpoint
- Pub/Sub Subscription latency on push endpoint changed abnormally (disabled by default)
- Pub/Sub Subscription oldest unacknowledged message
## Inputs
| Name | Description | Type | Default | Required |
|------|-------------|:----:|:-----:|:-----:|
| environment | Architecture environment | string | n/a | yes |
| evaluation\_delay | Delay in seconds for the metric evaluation | string | `"900"` | no |
| filter\_tags | Tags used for filtering | string | `"*"` | no |
| message | Message sent when a monitor is triggered | string | n/a | yes |
| new\_host\_delay | Delay in seconds for the new host evaluation | string | `"300"` | no |
| oldest\_unacked\_message\_age\_enabled | Flag to enable GCP Pub/Sub Subscription Oldest Unacked Message Age monitor | string | `"true"` | no |
| oldest\_unacked\_message\_age\_extra\_tags | Extra tags for GCP Pub/Sub Subscription Oldest Unacked Message Age monitor | list(string) | `[]` | no |
| oldest\_unacked\_message\_age\_message | Custom message for the GCP Pub/Sub Subscription Oldest Unacked Message Age monitor | string | `""` | no |
| oldest\_unacked\_message\_age\_threshold\_critical | GCP Pub/Sub Subscription Oldest Unacked Message Age critical threshold | string | `"120"` | no |
| oldest\_unacked\_message\_age\_threshold\_warning | GCP Pub/Sub Subscription Oldest Unacked Message Age warning threshold | string | `"30"` | no |
| oldest\_unacked\_message\_age\_time\_aggregator | Time aggregator for the GCP Pub/Sub Subscription Oldest Unacked Message Age monitor | string | `"min"` | no |
| oldest\_unacked\_message\_age\_timeframe | Timeframe for the GCP Pub/Sub Subscription Oldest Unacked Message Age monitor | string | `"last_5m"` | no |
| prefix\_slug | Prefix string to prepend between brackets on every monitors names | string | `""` | no |
| subscription\_push\_latency\_anomaly\_alert\_window | Alert window. | string | `"last_15m"` | no |
| subscription\_push\_latency\_anomaly\_count\_default\_zero | Count default zero. | string | `"true"` | no |
| subscription\_push\_latency\_anomaly\_detection\_algorithm | Anomaly Detection Algorithm used | string | `"basic"` | no |
| subscription\_push\_latency\_anomaly\_direction | Direction of the anomaly. It can be both, below or above. | string | `"above"` | no |
| subscription\_push\_latency\_anomaly\_enabled | Flag to enable GCP Pub/Sub Subscription Push Latency Anomaly monitor | string | `"false"` | no |
| subscription\_push\_latency\_anomaly\_extra\_tags | Extra tags for GCP Pub/Sub Subscription Push Latency Anomaly monitor | list(string) | `[]` | no |
| subscription\_push\_latency\_anomaly\_interval | Interval. | string | `"60"` | no |
| subscription\_push\_latency\_anomaly\_message | Custom message for the GCP Pub/Sub Subscription Push Latency Anomaly monitor | string | `""` | no |
| subscription\_push\_latency\_anomaly\_seasonality | Seasonality of the algorithm | string | `"daily"` | no |
| subscription\_push\_latency\_anomaly\_threshold\_critical | GCP Pub/Sub Subscription Push Latency Anomaly critical threshold | string | `"2"` | no |
| subscription\_push\_latency\_anomaly\_threshold\_warning | GCP Pub/Sub Subscription Push Latency Anomaly warning threshold | string | `"1"` | no |
| subscription\_push\_latency\_anomaly\_time\_aggregator | Time aggregator for the GCP Pub/Sub Subscription Push Latency Anomaly monitor | string | `"avg"` | no |
| subscription\_push\_latency\_anomaly\_timeframe | Timeframe for the GCP Pub/Sub Subscription Push Latency Anomaly monitor | string | `"last_10m"` | no |
| subscription\_push\_latency\_enabled | Flag to enable GCP Pub/Sub Subscription Push Latency High monitor | string | `"true"` | no |
| subscription\_push\_latency\_extra\_tags | Extra tags for GCP Pub/Sub Subscription Push Latency High monitor | list(string) | `[]` | no |
| subscription\_push\_latency\_message | Custom message for the GCP Pub/Sub Subscription Push Latency High monitor | string | `""` | no |
| subscription\_push\_latency\_threshold\_critical | GCP Pub/Sub Subscription Push Latency High critical threshold | string | `"5000"` | no |
| subscription\_push\_latency\_threshold\_warning | GCP Pub/Sub Subscription Push Latency High warning threshold | string | `"1000"` | no |
| subscription\_push\_latency\_time\_aggregator | Time aggregator for the GCP Pub/Sub Subscription Push Latency High monitor | string | `"avg"` | no |
| subscription\_push\_latency\_timeframe | Timeframe for the GCP Pub/Sub Subscription Push Latency High monitor | string | `"last_10m"` | no |
## Outputs
| Name | Description |
|------|-------------|
| oldest\_unacked\_message\_age\_id | id for monitor oldest_unacked_message_age |
| subscription\_push\_latency\_anomaly\_id | id for monitor subscription_push_latency_anomaly |
| subscription\_push\_latency\_id | id for monitor subscription_push_latency |
## Related documentation
* [GCP Pub/Sub Metrics](https://cloud.google.com/monitoring/api/metrics_gcp#gcp-pubsub)
* [Datadog GCP Pub/Sub integration](https://docs.datadoghq.com/integrations/google_cloud_pubsub/)

View File

@ -0,0 +1,206 @@
#
# Datadog global variables
#
# Required: interpolated into every monitor name and into the "env:" tag.
variable "environment" {
  type        = string
  description = "Architecture environment"
}

# Interpolated inside the {...} scope of each metric query.
variable "filter_tags" {
  description = "Tags used for filtering"
  default     = "*"
}

# Required: default notification message, used whenever a monitor-specific
# *_message variable is left empty.
variable "message" {
  description = "Message sent when a monitor is triggered"
}

# Passed unchanged to the evaluation_delay argument of every monitor.
variable "evaluation_delay" {
  description = "Delay in seconds for the metric evaluation"
  default     = 900
}

# Passed unchanged to the new_host_delay argument of every monitor.
variable "new_host_delay" {
  description = "Delay in seconds for the new host evaluation"
  default     = 300
}

# When non-empty, rendered as "[<prefix_slug>]" at the start of each monitor name.
variable "prefix_slug" {
  description = "Prefix string to prepend between brackets on every monitors names"
  default     = ""
}
#
# oldest_unacked_message_age
#
# The monitor is created only when this is exactly the string "true".
variable "oldest_unacked_message_age_enabled" {
  type        = string
  description = "Flag to enable GCP Pub/Sub Subscription Oldest Unacked Message Age monitor"
  default     = "true"
}

# Leave empty to fall back to var.message.
variable "oldest_unacked_message_age_message" {
  type        = string
  description = "Custom message for the GCP Pub/Sub Subscription Oldest Unacked Message Age monitor"
  default     = ""
}

variable "oldest_unacked_message_age_time_aggregator" {
  type        = string
  description = "Time aggregator for the GCP Pub/Sub Subscription Oldest Unacked Message Age monitor"
  default     = "min"
}

variable "oldest_unacked_message_age_timeframe" {
  type        = string
  description = "Timeframe for the GCP Pub/Sub Subscription Oldest Unacked Message Age monitor"
  default     = "last_5m"
}

# Thresholds are declared as strings; the defaults are written as strings to
# match the declared type (the value is the same after type conversion).
variable "oldest_unacked_message_age_threshold_warning" {
  type        = string
  description = "GCP Pub/Sub Subscription Oldest Unacked Message Age warning threshold"
  default     = "30"
}

variable "oldest_unacked_message_age_threshold_critical" {
  type        = string
  description = "GCP Pub/Sub Subscription Oldest Unacked Message Age critical threshold"
  default     = "120"
}

# Appended to the fixed tag list of the monitor.
variable "oldest_unacked_message_age_extra_tags" {
  type        = list(string)
  description = "Extra tags for GCP Pub/Sub Subscription Oldest Unacked Message Age monitor"
  default     = []
}
#
# subscription_push_latency
#
# The monitor is created only when this is exactly the string "true".
variable "subscription_push_latency_enabled" {
  type        = string
  description = "Flag to enable GCP Pub/Sub Subscription Push Latency High monitor"
  default     = "true"
}

# Leave empty to fall back to var.message.
variable "subscription_push_latency_message" {
  type        = string
  description = "Custom message for the GCP Pub/Sub Subscription Push Latency High monitor"
  default     = ""
}

variable "subscription_push_latency_time_aggregator" {
  type        = string
  description = "Time aggregator for the GCP Pub/Sub Subscription Push Latency High monitor"
  default     = "avg"
}

variable "subscription_push_latency_timeframe" {
  type        = string
  description = "Timeframe for the GCP Pub/Sub Subscription Push Latency High monitor"
  default     = "last_10m"
}

# Thresholds are declared as strings; the defaults are written as strings to
# match the declared type (the value is the same after type conversion).
# NOTE(review): the unit of these thresholds is that of the
# push_request_latencies metric - confirm it against the GCP metric reference.
variable "subscription_push_latency_threshold_warning" {
  type        = string
  description = "GCP Pub/Sub Subscription Push Latency High warning threshold"
  default     = "1000"
}

variable "subscription_push_latency_threshold_critical" {
  type        = string
  description = "GCP Pub/Sub Subscription Push Latency High critical threshold"
  default     = "5000"
}

# Appended to the fixed tag list of the monitor.
variable "subscription_push_latency_extra_tags" {
  type        = list(string)
  description = "Extra tags for GCP Pub/Sub Subscription Push Latency High monitor"
  default     = []
}
#
# subscription_push_latency_anomaly
#
# The anomaly monitor is opt-in: it is created only when this is exactly "true".
variable "subscription_push_latency_anomaly_enabled" {
  type        = string
  description = "Flag to enable GCP Pub/Sub Subscription Push Latency Anomaly monitor"
  default     = "false"
}

# Leave empty to fall back to var.message.
variable "subscription_push_latency_anomaly_message" {
  type        = string
  description = "Custom message for the GCP Pub/Sub Subscription Push Latency Anomaly monitor"
  default     = ""
}

variable "subscription_push_latency_anomaly_time_aggregator" {
  type        = string
  description = "Time aggregator for the GCP Pub/Sub Subscription Push Latency Anomaly monitor"
  default     = "avg"
}

variable "subscription_push_latency_anomaly_timeframe" {
  type        = string
  description = "Timeframe for the GCP Pub/Sub Subscription Push Latency Anomaly monitor"
  default     = "last_10m"
}

# The variables below are interpolated as arguments of the anomalies()
# function in the monitor query.

variable "subscription_push_latency_anomaly_detection_algorithm" {
  type        = string
  description = "Anomaly Detection Algorithm used"
  default     = "basic"
}

variable "subscription_push_latency_anomaly_direction" {
  type        = string
  description = "Direction of the anomaly. It can be both, below or above."
  default     = "above"
}

variable "subscription_push_latency_anomaly_alert_window" {
  type        = string
  description = "Alert window."
  default     = "last_15m"
}

# Interpolated unquoted into the query (interval=<value>).
variable "subscription_push_latency_anomaly_interval" {
  type        = string
  description = "Interval."
  default     = "60"
}

variable "subscription_push_latency_anomaly_count_default_zero" {
  type        = string
  description = "Count default zero."
  default     = "true"
}

variable "subscription_push_latency_anomaly_seasonality" {
  type        = string
  description = "Seasonality of the algorithm"
  default     = "daily"
}

# Thresholds are declared as strings; the defaults are written as strings to
# match the declared type (the value is the same after type conversion).
variable "subscription_push_latency_anomaly_threshold_warning" {
  type        = string
  description = "GCP Pub/Sub Subscription Push Latency Anomaly warning threshold"
  default     = "1"
}

variable "subscription_push_latency_anomaly_threshold_critical" {
  type        = string
  description = "GCP Pub/Sub Subscription Push Latency Anomaly critical threshold"
  default     = "2"
}

# Appended to the fixed tag list of the monitor.
variable "subscription_push_latency_anomaly_extra_tags" {
  type        = list(string)
  description = "Extra tags for GCP Pub/Sub Subscription Push Latency Anomaly monitor"
  default     = []
}

View File

@ -0,0 +1,128 @@
######################
# All Subscriptions #
######################
#
# oldest_unacked_message_age
#
# Monitor on gcp.pubsub.subscription.oldest_unacked_message_age, evaluated
# per subscription_id.
# - Created only when var.oldest_unacked_message_age_enabled is the string "true".
# - Triggers when the aggregated value is >= the critical threshold; a warning
#   threshold is also set.
resource "datadog_monitor" "oldest_unacked_message_age" {
  count   = var.oldest_unacked_message_age_enabled == "true" ? 1 : 0
  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Pub/Sub Subscription oldest unacknowledged message {{#is_alert}}{{{comparator}}} {{threshold}}s ({{value}}s){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}s ({{value}}s){{/is_warning}}"
  # Falls back to the module-wide message when no monitor-specific one is given.
  message = coalesce(var.oldest_unacked_message_age_message, var.message)
  type    = "query alert"

  # Heredoc body is kept flush-left so the query string sent to Datadog is unchanged.
  query = <<EOQ
${var.oldest_unacked_message_age_time_aggregator}(${var.oldest_unacked_message_age_timeframe}):
avg:gcp.pubsub.subscription.oldest_unacked_message_age{${var.filter_tags}} by {subscription_id}
>= ${var.oldest_unacked_message_age_threshold_critical}
EOQ

  thresholds = {
    warning  = var.oldest_unacked_message_age_threshold_warning
    critical = var.oldest_unacked_message_age_threshold_critical
  }

  evaluation_delay    = var.evaluation_delay
  new_host_delay      = var.new_host_delay
  notify_audit        = false
  locked              = false
  timeout_h           = 0
  include_tags        = true
  require_full_window = false
  notify_no_data      = false
  renotify_interval   = 0

  tags = concat(["env:${var.environment}", "type:cloud", "provider:gcp", "resource:pubsub", "category:subscription", "team:claranet", "created-by:terraform"], var.oldest_unacked_message_age_extra_tags)

  lifecycle {
    # Bare attribute reference: quoted references in ignore_changes are
    # deprecated since Terraform 0.12 and emit a warning.
    ignore_changes = [silenced]
  }
}
######################
# Push Subscriptions #
######################
#
# subscription_push_latency
#
# Monitor on gcp.pubsub.subscription.push_request_latencies.avg, evaluated
# per subscription_id.
# - Created only when var.subscription_push_latency_enabled is the string "true".
# - Triggers when the aggregated value is >= the critical threshold; a warning
#   threshold is also set.
# NOTE(review): the title appends an "s" suffix to the threshold and value -
# confirm that this matches the unit Datadog reports for this metric.
resource "datadog_monitor" "subscription_push_latency" {
  count   = var.subscription_push_latency_enabled == "true" ? 1 : 0
  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Pub/Sub Subscription latency on push endpoint {{#is_alert}}{{{comparator}}} {{threshold}}s ({{value}}s){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}s ({{value}}s){{/is_warning}}"
  # Falls back to the module-wide message when no monitor-specific one is given.
  message = coalesce(var.subscription_push_latency_message, var.message)
  type    = "query alert"

  # Heredoc body is kept flush-left so the query string sent to Datadog is unchanged.
  query = <<EOQ
${var.subscription_push_latency_time_aggregator}(${var.subscription_push_latency_timeframe}):
avg:gcp.pubsub.subscription.push_request_latencies.avg{${var.filter_tags}} by {subscription_id}
>= ${var.subscription_push_latency_threshold_critical}
EOQ

  thresholds = {
    warning  = var.subscription_push_latency_threshold_warning
    critical = var.subscription_push_latency_threshold_critical
  }

  evaluation_delay    = var.evaluation_delay
  new_host_delay      = var.new_host_delay
  notify_audit        = false
  locked              = false
  timeout_h           = 0
  include_tags        = true
  require_full_window = false
  notify_no_data      = false
  renotify_interval   = 0

  tags = concat(["env:${var.environment}", "type:cloud", "provider:gcp", "resource:pubsub", "category:subscription", "team:claranet", "created-by:terraform"], var.subscription_push_latency_extra_tags)

  lifecycle {
    # Bare attribute reference: quoted references in ignore_changes are
    # deprecated since Terraform 0.12 and emit a warning.
    ignore_changes = [silenced]
  }
}
#
# subscription_push_latency_anomaly
#
# Anomaly monitor on gcp.pubsub.subscription.push_request_latencies.avg,
# evaluated per subscription_id.
# - Created only when var.subscription_push_latency_anomaly_enabled is "true"
#   (the variable defaults to "false").
#
# Query fixes compared with the previous version:
# - the metric argument of anomalies() is now followed by a comma;
# - the third positional argument is the numeric bounds (deviations) value
#   instead of a second metric query. It is fixed at 2 here.
#   TODO: expose it as a module variable if it needs to be tunable.
# - seasonality is emitted when the detection algorithm is seasonal
#   ("agile" or "robust"); the previous guard compared the seasonality value
#   itself to "agile" and therefore never matched a valid seasonality.
#
# NOTE(review): confirm the threshold defaults against the range Datadog
# expects for anomaly monitors.
resource "datadog_monitor" "subscription_push_latency_anomaly" {
  count   = var.subscription_push_latency_anomaly_enabled == "true" ? 1 : 0
  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Pub/Sub Subscription latency on push endpoint changed abnormally {{#is_alert}}{{{comparator}}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ({{value}}){{/is_warning}}"
  # Falls back to the module-wide message when no monitor-specific one is given.
  message = coalesce(var.subscription_push_latency_anomaly_message, var.message)
  type    = "query alert"

  # Heredoc body is kept flush-left, matching the other monitors of this module.
  query = <<EOQ
${var.subscription_push_latency_anomaly_time_aggregator}(${var.subscription_push_latency_anomaly_timeframe}):
anomalies(
avg:gcp.pubsub.subscription.push_request_latencies.avg{${var.filter_tags}} by {subscription_id},
'${var.subscription_push_latency_anomaly_detection_algorithm}',
2,
direction='${var.subscription_push_latency_anomaly_direction}',
alert_window='${var.subscription_push_latency_anomaly_alert_window}',
interval=${var.subscription_push_latency_anomaly_interval},
count_default_zero='${var.subscription_push_latency_anomaly_count_default_zero}'
${contains(["agile", "robust"], var.subscription_push_latency_anomaly_detection_algorithm) ? format(",seasonality='%s'", var.subscription_push_latency_anomaly_seasonality) : ""}
)
>= ${var.subscription_push_latency_anomaly_threshold_critical}
EOQ

  thresholds = {
    warning  = var.subscription_push_latency_anomaly_threshold_warning
    critical = var.subscription_push_latency_anomaly_threshold_critical
  }

  evaluation_delay    = var.evaluation_delay
  new_host_delay      = var.new_host_delay
  notify_audit        = false
  locked              = false
  timeout_h           = 0
  include_tags        = true
  require_full_window = false
  notify_no_data      = false
  renotify_interval   = 0

  tags = concat(["env:${var.environment}", "type:cloud", "provider:gcp", "resource:pubsub", "category:subscription", "team:claranet", "created-by:terraform"], var.subscription_push_latency_anomaly_extra_tags)

  lifecycle {
    # Bare attribute reference: quoted references in ignore_changes are
    # deprecated since Terraform 0.12 and emit a warning.
    ignore_changes = [silenced]
  }
}

View File

@ -0,0 +1,15 @@
# Each output is a list of monitor ids: the monitors are declared with
# `count`, so the list is empty when the corresponding monitor is disabled.

output "oldest_unacked_message_age_id" {
  description = "id for monitor oldest_unacked_message_age"
  value       = datadog_monitor.oldest_unacked_message_age[*].id
}

output "subscription_push_latency_id" {
  description = "id for monitor subscription_push_latency"
  value       = datadog_monitor.subscription_push_latency[*].id
}

output "subscription_push_latency_anomaly_id" {
  description = "id for monitor subscription_push_latency_anomaly"
  value       = datadog_monitor.subscription_push_latency_anomaly[*].id
}

View File

@ -1,10 +1,10 @@
# CLOUD GCP PUBSUB DataDog monitors
# CLOUD GCP PUBSUB TOPIC DataDog monitors
## How to use this module
```
module "datadog-monitors-cloud-gcp-pubsub" {
source = "git::ssh://git@git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors.git//cloud/gcp/pubsub?ref={revision}"
module "datadog-monitors-cloud-gcp-pubsub-topic" {
source = "git::ssh://git@git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors.git//cloud/gcp/pubsub/topic?ref={revision}"
environment = var.environment
message = module.datadog-message-alerting.alerting-message
@ -16,8 +16,9 @@ module "datadog-monitors-cloud-gcp-pubsub" {
Creates DataDog monitors with the following checks:
- GCP pubsub sending messages operations
- GCP pubsub sending messages with result unavailable
- Pub/Sub Topic ratio of sending messages with result unavailable
- Pub/Sub Topic sending messages operations
- Pub/Sub Topic sending messages with result unavailable (disabled by default)
## Inputs
@ -35,13 +36,20 @@ Creates DataDog monitors with the following checks:
| sending\_operations\_count\_threshold\_critical | Critical threshold for the number of sending operations. | string | `"0"` | no |
| sending\_operations\_count\_time\_aggregator | Time aggregator for the GCP Pub/Sub Sending Operations Count monitor | string | `"sum"` | no |
| sending\_operations\_count\_timeframe | Timeframe for the GCP Pub/Sub Sending Operations Count monitor | string | `"last_30m"` | no |
| unavailable\_sending\_operations\_count\_enabled | Flag to enable GCP Pub/Sub Unavailable Sending Operations Count monitor | string | `"true"` | no |
| unavailable\_sending\_operations\_count\_enabled | Flag to enable GCP Pub/Sub Unavailable Sending Operations Count monitor | string | `"false"` | no |
| unavailable\_sending\_operations\_count\_extra\_tags | Extra tags for GCP Pub/Sub Unavailable Sending Operations Count monitor | list(string) | `[]` | no |
| unavailable\_sending\_operations\_count\_message | Custom message for the GCP Pub/Sub Unavailable Sending Operations Count monitor | string | `""` | no |
| unavailable\_sending\_operations\_count\_threshold\_critical | Critical threshold for the number of unavailable sending operations | string | `"4"` | no |
| unavailable\_sending\_operations\_count\_threshold\_warning | Warning threshold for the number of unavailable sending operations | string | `"2"` | no |
| unavailable\_sending\_operations\_count\_time\_aggregator | Time aggregator for the GCP Pub/Sub Unavailable Sending Operations Count monitor | string | `"sum"` | no |
| unavailable\_sending\_operations\_count\_timeframe | Timeframe for the GCP Pub/Sub Unavailable Sending Operations Count monitor | string | `"last_10m"` | no |
| unavailable\_sending\_operations\_ratio\_enabled | Flag to enable GCP Pub/Sub Unavailable Sending Operations Ratio monitor | string | `"true"` | no |
| unavailable\_sending\_operations\_ratio\_extra\_tags | Extra tags for GCP Pub/Sub Unavailable Sending Operations Ratio monitor | list(string) | `[]` | no |
| unavailable\_sending\_operations\_ratio\_message | Custom message for the GCP Pub/Sub Unavailable Sending Operations Ratio monitor | string | `""` | no |
| unavailable\_sending\_operations\_ratio\_threshold\_critical | Critical threshold (%) for the ratio of unavailable sending operations | string | `"20"` | no |
| unavailable\_sending\_operations\_ratio\_threshold\_warning | Warning threshold (%) for the ratio of unavailable sending operations | string | `"10"` | no |
| unavailable\_sending\_operations\_ratio\_time\_aggregator | Time aggregator for the GCP Pub/Sub Unavailable Sending Operations Ratio monitor | string | `"sum"` | no |
| unavailable\_sending\_operations\_ratio\_timeframe | Timeframe for the GCP Pub/Sub Unavailable Sending Operations Ratio monitor | string | `"last_10m"` | no |
## Outputs
@ -49,6 +57,7 @@ Creates DataDog monitors with the following checks:
|------|-------------|
| sending\_operations\_count\_id | id for monitor sending_operations_count |
| unavailable\_sending\_operations\_count\_id | id for monitor unavailable_sending_operations_count |
| unavailable\_sending\_operations\_ratio\_id | id for monitor unavailable_sending_operations_ratio |
## Related documentation

View File

@ -105,7 +105,7 @@ variable "unavailable_sending_operations_count_threshold_critical" {
variable "unavailable_sending_operations_count_enabled" {
description = "Flag to enable GCP Pub/Sub Unavailable Sending Operations Count monitor"
type = string
default = "true"
default = "false"
}
variable "unavailable_sending_operations_count_extra_tags" {
@ -114,3 +114,47 @@ variable "unavailable_sending_operations_count_extra_tags" {
default = []
}
#
# Unavailable Sending Operations Ratio
#
# Leave empty to fall back to var.message.
variable "unavailable_sending_operations_ratio_message" {
  description = "Custom message for the GCP Pub/Sub Unavailable Sending Operations Ratio monitor"
  type        = string
  default     = ""
}

# Description corrected: this variable is the time aggregator of the query
# (it previously read "Timeframe", copied from the variable below).
variable "unavailable_sending_operations_ratio_time_aggregator" {
  description = "Time aggregator for the GCP Pub/Sub Unavailable Sending Operations Ratio monitor"
  type        = string
  default     = "sum"
}

variable "unavailable_sending_operations_ratio_timeframe" {
  description = "Timeframe for the GCP Pub/Sub Unavailable Sending Operations Ratio monitor"
  type        = string
  default     = "last_10m"
}

# Thresholds are percentages (the query multiplies the ratio by 100). The
# defaults are written as strings to match the declared type; the value is
# the same after type conversion.
variable "unavailable_sending_operations_ratio_threshold_warning" {
  description = "Warning threshold (%) for the ratio of unavailable sending operations"
  type        = string
  default     = "10"
}

variable "unavailable_sending_operations_ratio_threshold_critical" {
  description = "Critical threshold (%) for the ratio of unavailable sending operations"
  type        = string
  default     = "20"
}

# The monitor is created only when this is exactly the string "true".
variable "unavailable_sending_operations_ratio_enabled" {
  description = "Flag to enable GCP Pub/Sub Unavailable Sending Operations Ratio monitor"
  type        = string
  default     = "true"
}

# Appended to the fixed tag list of the monitor.
variable "unavailable_sending_operations_ratio_extra_tags" {
  description = "Extra tags for GCP Pub/Sub Unavailable Sending Operations Ratio monitor"
  type        = list(string)
  default     = []
}

View File

@ -0,0 +1,111 @@
#
# Sending Operations Count
#
# Monitor on the number of Pub/Sub send-message operations, evaluated per topic_id.
# - Created only when var.sending_operations_count_enabled is the string "true".
# - Triggers when the aggregated count is <= the critical threshold; missing
#   points are replaced by 0 through default(..., 0).
# - Also notifies when no data is received (notify_no_data = true).
resource "datadog_monitor" "sending_operations_count" {
  count   = var.sending_operations_count_enabled == "true" ? 1 : 0
  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Pub/Sub Topic sending messages operations {{#is_alert}}{{{comparator}}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ({{value}}){{/is_warning}}"
  # Falls back to the module-wide message when no monitor-specific one is given.
  message = coalesce(var.sending_operations_count_message, var.message)
  type    = "query alert"

  # Heredoc body is kept flush-left so the query string sent to Datadog is unchanged.
  query = <<EOQ
${var.sending_operations_count_time_aggregator}(${var.sending_operations_count_timeframe}):
default(avg:gcp.pubsub.topic.send_message_operation_count{${var.filter_tags}} by {topic_id}.as_count(), 0)
<= ${var.sending_operations_count_threshold_critical}
EOQ

  thresholds = {
    critical = var.sending_operations_count_threshold_critical
  }

  evaluation_delay    = var.evaluation_delay
  new_host_delay      = var.new_host_delay
  notify_audit        = false
  locked              = false
  timeout_h           = 0
  include_tags        = true
  require_full_window = false
  notify_no_data      = true
  renotify_interval   = 0

  tags = concat(["env:${var.environment}", "type:cloud", "provider:gcp", "resource:pubsub", "category:topic", "team:claranet", "created-by:terraform"], var.sending_operations_count_extra_tags)

  lifecycle {
    # Bare attribute reference: quoted references in ignore_changes are
    # deprecated since Terraform 0.12 and emit a warning.
    ignore_changes = [silenced]
  }
}
#
# Unavailable Sending Operations Count
#
# Monitor on the number of Pub/Sub send-message operations whose
# response_code tag is "unavailable", evaluated per topic_id.
# - Created only when var.unavailable_sending_operations_count_enabled is "true".
# - Triggers when the aggregated count is >= the critical threshold; missing
#   points are replaced by 0 through default(..., 0).
resource "datadog_monitor" "unavailable_sending_operations_count" {
  count   = var.unavailable_sending_operations_count_enabled == "true" ? 1 : 0
  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Pub/Sub Topic sending messages with result unavailable {{#is_alert}}{{{comparator}}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ({{value}}){{/is_warning}}"
  # Falls back to the module-wide message when no monitor-specific one is given.
  message = coalesce(var.unavailable_sending_operations_count_message, var.message)
  type    = "query alert"

  # Heredoc body is kept flush-left so the query string sent to Datadog is unchanged.
  query = <<EOQ
${var.unavailable_sending_operations_count_time_aggregator}(${var.unavailable_sending_operations_count_timeframe}):
default(avg:gcp.pubsub.topic.send_message_operation_count{${var.filter_tags},response_code:unavailable} by {topic_id}.as_count(), 0)
>= ${var.unavailable_sending_operations_count_threshold_critical}
EOQ

  thresholds = {
    warning  = var.unavailable_sending_operations_count_threshold_warning
    critical = var.unavailable_sending_operations_count_threshold_critical
  }

  evaluation_delay    = var.evaluation_delay
  new_host_delay      = var.new_host_delay
  notify_audit        = false
  locked              = false
  timeout_h           = 0
  include_tags        = true
  require_full_window = false
  notify_no_data      = false
  renotify_interval   = 0

  tags = concat(["env:${var.environment}", "type:cloud", "provider:gcp", "resource:pubsub", "category:topic", "team:claranet", "created-by:terraform"], var.unavailable_sending_operations_count_extra_tags)

  lifecycle {
    # Bare attribute reference: quoted references in ignore_changes are
    # deprecated since Terraform 0.12 and emit a warning.
    ignore_changes = [silenced]
  }
}
#
# Unavailable Sending Operations Ratio
#
# Monitor on the percentage of Pub/Sub send-message operations whose
# response_code tag is "unavailable", evaluated per topic_id.
# - Created only when var.unavailable_sending_operations_ratio_enabled is "true".
# - The query computes 100 * unavailable / total and triggers when the result
#   is >= the critical threshold; a warning threshold is also set.
# - Both numerator and denominator replace missing points by 0 through
#   default(..., 0).
#   NOTE(review): confirm how Datadog evaluates points where the denominator is 0.
resource "datadog_monitor" "unavailable_sending_operations_ratio" {
  count   = var.unavailable_sending_operations_ratio_enabled == "true" ? 1 : 0
  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Pub/Sub Topic ratio of sending messages with result unavailable {{#is_alert}}{{{comparator}}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ({{value}}){{/is_warning}}"
  # Falls back to the module-wide message when no monitor-specific one is given.
  message = coalesce(var.unavailable_sending_operations_ratio_message, var.message)
  type    = "query alert"

  # Heredoc body is kept flush-left so the query string sent to Datadog is unchanged.
  query = <<EOQ
${var.unavailable_sending_operations_ratio_time_aggregator}(${var.unavailable_sending_operations_ratio_timeframe}):
(100 * default(avg:gcp.pubsub.topic.send_message_operation_count{${var.filter_tags},response_code:unavailable} by {topic_id}.as_count(), 0))
/
default(avg:gcp.pubsub.topic.send_message_operation_count{${var.filter_tags}} by {topic_id}.as_count(), 0)
>= ${var.unavailable_sending_operations_ratio_threshold_critical}
EOQ

  thresholds = {
    warning  = var.unavailable_sending_operations_ratio_threshold_warning
    critical = var.unavailable_sending_operations_ratio_threshold_critical
  }

  evaluation_delay    = var.evaluation_delay
  new_host_delay      = var.new_host_delay
  notify_audit        = false
  locked              = false
  timeout_h           = 0
  include_tags        = true
  require_full_window = false
  notify_no_data      = false
  renotify_interval   = 0

  tags = concat(["env:${var.environment}", "type:cloud", "provider:gcp", "resource:pubsub", "category:topic", "team:claranet", "created-by:terraform"], var.unavailable_sending_operations_ratio_extra_tags)

  lifecycle {
    # Bare attribute reference: quoted references in ignore_changes are
    # deprecated since Terraform 0.12 and emit a warning.
    ignore_changes = [silenced]
  }
}

View File

@ -8,3 +8,8 @@ output "unavailable_sending_operations_count_id" {
value = datadog_monitor.unavailable_sending_operations_count.*.id
}
# List of ids of the ratio monitor: the monitor is declared with `count`,
# so the list is empty when it is disabled.
output "unavailable_sending_operations_ratio_id" {
  description = "id for monitor unavailable_sending_operations_ratio"
  value       = datadog_monitor.unavailable_sending_operations_ratio[*].id
}