MON-400 monitors for pubsub topics and subscriptions
This commit is contained in:
parent
1c809454d7
commit
a2ac8989bf
@ -184,6 +184,8 @@ The `//` is very important, it's a terraform specific syntax used to separate gi
|
||||
- [instance](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/gcp/gce/instance/)
|
||||
- [lb](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/gcp/lb/)
|
||||
- [pubsub](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/gcp/pubsub/)
|
||||
- [subscription](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/gcp/pubsub/subscription/)
|
||||
- [topic](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/gcp/pubsub/topic/)
|
||||
- [common](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/common/)
|
||||
- [alerting-message](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/common/alerting-message/)
|
||||
- [filter-tags](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/common/filter-tags/)
|
||||
|
||||
@ -1,73 +0,0 @@
|
||||
#
|
||||
# Sending Operations Count
|
||||
#
|
||||
# Query alert on the number of Pub/Sub "send message" operations per topic.
# Triggers when the aggregated count is at or below the critical threshold;
# missing series are treated as 0 by default(), and the monitor also notifies
# when no data is received.
resource "datadog_monitor" "sending_operations_count" {
  # The module toggles monitors with the string "true" rather than a bool.
  count   = var.sending_operations_count_enabled == "true" ? 1 : 0
  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] GCP pubsub sending messages operations {{#is_alert}}{{{comparator}}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ({{value}}){{/is_warning}}"
  # coalesce() falls back to the module-wide message when no custom one is set.
  message = coalesce(var.sending_operations_count_message, var.message)
  type    = "query alert"

  query = <<EOQ
${var.sending_operations_count_time_aggregator}(${var.sending_operations_count_timeframe}):
default(avg:gcp.pubsub.topic.send_message_operation_count{${var.filter_tags}} by {topic_id}.as_count(), 0)
<= ${var.sending_operations_count_threshold_critical}
EOQ

  thresholds = {
    critical = var.sending_operations_count_threshold_critical
  }

  evaluation_delay    = var.evaluation_delay
  new_host_delay      = var.new_host_delay
  notify_audit        = false
  locked              = false
  timeout_h           = 0
  include_tags        = true
  require_full_window = false
  notify_no_data      = true
  renotify_interval   = 0

  tags = concat(["env:${var.environment}", "type:cloud", "provider:gcp", "resource:pubsub", "team:claranet", "created-by:terraform"], var.sending_operations_count_extra_tags)

  lifecycle {
    # Bare attribute reference: the quoted form is deprecated since Terraform 0.12.
    ignore_changes = [silenced]
  }
}
|
||||
|
||||
#
|
||||
# Unavailable Sending Operations Count
|
||||
#
|
||||
# Query alert on the number of Pub/Sub "send message" operations per topic
# whose response code is "unavailable". Triggers when the aggregated count is
# at or above the critical threshold; missing series are treated as 0.
resource "datadog_monitor" "unavailable_sending_operations_count" {
  # The module toggles monitors with the string "true" rather than a bool.
  count   = var.unavailable_sending_operations_count_enabled == "true" ? 1 : 0
  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] GCP pubsub sending messages with result unavailable {{#is_alert}}{{{comparator}}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ({{value}}){{/is_warning}}"
  # coalesce() falls back to the module-wide message when no custom one is set.
  message = coalesce(var.unavailable_sending_operations_count_message, var.message)
  type    = "query alert"

  query = <<EOQ
${var.unavailable_sending_operations_count_time_aggregator}(${var.unavailable_sending_operations_count_timeframe}):
default(avg:gcp.pubsub.topic.send_message_operation_count{${var.filter_tags},response_code:unavailable} by {topic_id}.as_count(), 0)
>= ${var.unavailable_sending_operations_count_threshold_critical}
EOQ

  thresholds = {
    warning  = var.unavailable_sending_operations_count_threshold_warning
    critical = var.unavailable_sending_operations_count_threshold_critical
  }

  evaluation_delay    = var.evaluation_delay
  new_host_delay      = var.new_host_delay
  notify_audit        = false
  locked              = false
  timeout_h           = 0
  include_tags        = true
  require_full_window = false
  notify_no_data      = false
  renotify_interval   = 0

  tags = concat(["env:${var.environment}", "type:cloud", "provider:gcp", "resource:pubsub", "team:claranet", "created-by:terraform"], var.unavailable_sending_operations_count_extra_tags)

  lifecycle {
    # Bare attribute reference: the quoted form is deprecated since Terraform 0.12.
    ignore_changes = [silenced]
  }
}
|
||||
|
||||
72
cloud/gcp/pubsub/subscription/README.md
Normal file
72
cloud/gcp/pubsub/subscription/README.md
Normal file
@ -0,0 +1,72 @@
|
||||
# CLOUD GCP PUBSUB SUBSCRIPTION DataDog monitors
|
||||
|
||||
## How to use this module
|
||||
|
||||
```
|
||||
module "datadog-monitors-cloud-gcp-pubsub-subscription" {
|
||||
source = "git::ssh://git@git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors.git//cloud/gcp/pubsub/subscription?ref={revision}"
|
||||
|
||||
environment = var.environment
|
||||
message = module.datadog-message-alerting.alerting-message
|
||||
}
|
||||
|
||||
```
|
||||
|
||||
## Purpose
|
||||
|
||||
Creates DataDog monitors with the following checks:
|
||||
|
||||
- GCP Pub/Sub Subscription Anomaly latency on push endpoint (disabled by default)
|
||||
- GCP Pub/Sub Subscription Latency on push endpoint
|
||||
- GCP Pub/Sub Subscription Oldest unacked message age
|
||||
|
||||
## Inputs
|
||||
|
||||
| Name | Description | Type | Default | Required |
|
||||
|------|-------------|:----:|:-----:|:-----:|
|
||||
| environment | Architecture environment | string | n/a | yes |
|
||||
| evaluation\_delay | Delay in seconds for the metric evaluation | string | `"900"` | no |
|
||||
| filter\_tags | Tags used for filtering | string | `"*"` | no |
|
||||
| message | Message sent when a monitor is triggered | string | n/a | yes |
|
||||
| new\_host\_delay | Delay in seconds for the new host evaluation | string | `"300"` | no |
|
||||
| oldest\_unacked\_message\_age\_enabled | Flag to enable GCP Pub/Sub Subscription Oldest Unacked Message Age monitor | string | `"true"` | no |
|
||||
| oldest\_unacked\_message\_age\_extra\_tags | Extra tags for GCP Pub/Sub Subscription Oldest Unacked Message Age monitor | list(string) | `[]` | no |
|
||||
| oldest\_unacked\_message\_age\_message | Custom message for the GCP Pub/Sub Subscription Oldest Unacked Message Age monitor | string | `""` | no |
|
||||
| oldest\_unacked\_message\_age\_threshold\_critical | GCP Pub/Sub Subscription Oldest Unacked Message Age critical threshold | string | `"120"` | no |
|
||||
| oldest\_unacked\_message\_age\_threshold\_warning | GCP Pub/Sub Subscription Oldest Unacked Message Age warning threshold | string | `"30"` | no |
|
||||
| oldest\_unacked\_message\_age\_time\_aggregator | Time aggregator for the GCP Pub/Sub Subscription Oldest Unacked Message Age monitor | string | `"min"` | no |
|
||||
| oldest\_unacked\_message\_age\_timeframe | Timeframe for the GCP Pub/Sub Subscription Oldest Unacked Message Age monitor | string | `"last_5m"` | no |
|
||||
| prefix\_slug | Prefix string to prepend between brackets on every monitors names | string | `""` | no |
|
||||
| subscription\_push\_latency\_anomaly\_alert\_window | Alert window. | string | `"last_15m"` | no |
|
||||
| subscription\_push\_latency\_anomaly\_count\_default\_zero | Count default zero. | string | `"true"` | no |
|
||||
| subscription\_push\_latency\_anomaly\_detection\_algorithm | Anomaly Detection Algorithm used | string | `"basic"` | no |
|
||||
| subscription\_push\_latency\_anomaly\_direction | Direction of the anomaly. It can be both, below or above. | string | `"above"` | no |
|
||||
| subscription\_push\_latency\_anomaly\_enabled | Flag to enable GCP Pub/Sub Subscription Push Latency Anomaly monitor | string | `"false"` | no |
|
||||
| subscription\_push\_latency\_anomaly\_extra\_tags | Extra tags for GCP Pub/Sub Subscription Push Latency Anomaly monitor | list(string) | `[]` | no |
|
||||
| subscription\_push\_latency\_anomaly\_interval | Interval. | string | `"60"` | no |
|
||||
| subscription\_push\_latency\_anomaly\_message | Custom message for the GCP Pub/Sub Subscription Push Latency Anomaly monitor | string | `""` | no |
|
||||
| subscription\_push\_latency\_anomaly\_seasonality | Seasonality of the algorithm | string | `"daily"` | no |
|
||||
| subscription\_push\_latency\_anomaly\_threshold\_critical | GCP Pub/Sub Subscription Push Latency Anomaly critical threshold | string | `"2"` | no |
|
||||
| subscription\_push\_latency\_anomaly\_threshold\_warning | GCP Pub/Sub Subscription Push Latency Anomaly warning threshold | string | `"1"` | no |
|
||||
| subscription\_push\_latency\_anomaly\_time\_aggregator | Time aggregator for the GCP Pub/Sub Subscription Push Latency Anomaly monitor | string | `"avg"` | no |
|
||||
| subscription\_push\_latency\_anomaly\_timeframe | Timeframe for the GCP Pub/Sub Subscription Push Latency Anomaly monitor | string | `"last_10m"` | no |
|
||||
| subscription\_push\_latency\_enabled | Flag to enable GCP Pub/Sub Subscription Push Latency High monitor | string | `"true"` | no |
|
||||
| subscription\_push\_latency\_extra\_tags | Extra tags for GCP Pub/Sub Subscription Push Latency High monitor | list(string) | `[]` | no |
|
||||
| subscription\_push\_latency\_message | Custom message for the GCP Pub/Sub Subscription Push Latency High monitor | string | `""` | no |
|
||||
| subscription\_push\_latency\_threshold\_critical | GCP Pub/Sub Subscription Push Latency High critical threshold | string | `"5000"` | no |
|
||||
| subscription\_push\_latency\_threshold\_warning | GCP Pub/Sub Subscription Push Latency High warning threshold | string | `"1000"` | no |
|
||||
| subscription\_push\_latency\_time\_aggregator | Time aggregator for the GCP Pub/Sub Subscription Push Latency High monitor | string | `"avg"` | no |
|
||||
| subscription\_push\_latency\_timeframe | Timeframe for the GCP Pub/Sub Subscription Push Latency High monitor | string | `"last_10m"` | no |
|
||||
|
||||
## Outputs
|
||||
|
||||
| Name | Description |
|
||||
|------|-------------|
|
||||
| oldest\_unacked\_message\_age\_id | id for monitor oldest_unacked_message_age |
|
||||
| subscription\_push\_latency\_anomaly\_id | id for monitor subscription_push_latency_anomaly |
|
||||
| subscription\_push\_latency\_id | id for monitor subscription_push_latency |
|
||||
|
||||
## Related documentation
|
||||
|
||||
* [GCP Pub/Sub Metrics](https://cloud.google.com/monitoring/api/metrics_gcp#gcp-pubsub)
|
||||
* [Datadog GCP Pub/Sub integration](https://docs.datadoghq.com/integrations/google_cloud_pubsub/)
|
||||
206
cloud/gcp/pubsub/subscription/inputs.tf
Normal file
206
cloud/gcp/pubsub/subscription/inputs.tf
Normal file
@ -0,0 +1,206 @@
|
||||
#
|
||||
# Datadog global variables
|
||||
#
|
||||
# Module-wide inputs shared by every monitor in this module.

# Environment label; used in monitor names and in the "env:" tag.
variable "environment" {
  description = "Architecture environment"
  type        = string
}

# Datadog tag filter interpolated into the {...} scope of each query.
variable "filter_tags" {
  description = "Tags used for filtering"
  type        = string
  default     = "*"
}

# Default notification message; each monitor may override it with its own
# *_message variable.
variable "message" {
  description = "Message sent when a monitor is triggered"
  type        = string
}

# Passed straight to the monitors' evaluation_delay argument.
# Left untyped on purpose so existing callers passing either a number or a
# string keep working.
variable "evaluation_delay" {
  description = "Delay in seconds for the metric evaluation"
  default     = 900
}

# Passed straight to the monitors' new_host_delay argument (untyped, see above).
variable "new_host_delay" {
  description = "Delay in seconds for the new host evaluation"
  default     = 300
}

# When non-empty, rendered as "[prefix]" in front of every monitor name.
variable "prefix_slug" {
  description = "Prefix string to prepend between brackets on every monitors names"
  type        = string
  default     = ""
}
|
||||
|
||||
#
|
||||
# oldest_unacked_message_age
|
||||
#
|
||||
|
||||
# Inputs of the "oldest unacked message age" monitor.
# Defaults of string-typed variables are written as strings so the literal
# matches the declared type (Terraform converted the numbers implicitly).

# Compared against the string "true" to decide whether the monitor is created.
variable "oldest_unacked_message_age_enabled" {
  description = "Flag to enable GCP Pub/Sub Subscription Oldest Unacked Message Age monitor"
  type        = string
  default     = "true"
}

# Empty string means "use the module-wide message".
variable "oldest_unacked_message_age_message" {
  description = "Custom message for the GCP Pub/Sub Subscription Oldest Unacked Message Age monitor"
  type        = string
  default     = ""
}

variable "oldest_unacked_message_age_time_aggregator" {
  description = "Time aggregator for the GCP Pub/Sub Subscription Oldest Unacked Message Age monitor"
  type        = string
  default     = "min"
}

variable "oldest_unacked_message_age_timeframe" {
  description = "Timeframe for the GCP Pub/Sub Subscription Oldest Unacked Message Age monitor"
  type        = string
  default     = "last_5m"
}

# The monitor name renders the value followed by "s old", i.e. seconds.
variable "oldest_unacked_message_age_threshold_warning" {
  description = "GCP Pub/Sub Subscription Oldest Unacked Message Age warning threshold"
  type        = string
  default     = "30"
}

variable "oldest_unacked_message_age_threshold_critical" {
  description = "GCP Pub/Sub Subscription Oldest Unacked Message Age critical threshold"
  type        = string
  default     = "120"
}

# Appended to the monitor's built-in tag list.
variable "oldest_unacked_message_age_extra_tags" {
  description = "Extra tags for GCP Pub/Sub Subscription Oldest Unacked Message Age monitor"
  type        = list(string)
  default     = []
}
|
||||
|
||||
#
|
||||
# subscription_push_latency
|
||||
#
|
||||
# Inputs of the "push latency high" monitor.
# Defaults of string-typed variables are written as strings so the literal
# matches the declared type (Terraform converted the numbers implicitly).

# Compared against the string "true" to decide whether the monitor is created.
variable "subscription_push_latency_enabled" {
  description = "Flag to enable GCP Pub/Sub Subscription Push Latency High monitor"
  type        = string
  default     = "true"
}

# Empty string means "use the module-wide message".
variable "subscription_push_latency_message" {
  description = "Custom message for the GCP Pub/Sub Subscription Push Latency High monitor"
  type        = string
  default     = ""
}

variable "subscription_push_latency_time_aggregator" {
  description = "Time aggregator for the GCP Pub/Sub Subscription Push Latency High monitor"
  type        = string
  default     = "avg"
}

variable "subscription_push_latency_timeframe" {
  description = "Timeframe for the GCP Pub/Sub Subscription Push Latency High monitor"
  type        = string
  default     = "last_10m"
}

# NOTE(review): the unit is whatever gcp.pubsub.subscription.push_request_latencies
# reports; confirm against the Datadog integration before tuning.
variable "subscription_push_latency_threshold_warning" {
  description = "GCP Pub/Sub Subscription Push Latency High warning threshold"
  type        = string
  default     = "1000"
}

variable "subscription_push_latency_threshold_critical" {
  description = "GCP Pub/Sub Subscription Push Latency High critical threshold"
  type        = string
  default     = "5000"
}

# Appended to the monitor's built-in tag list.
variable "subscription_push_latency_extra_tags" {
  description = "Extra tags for GCP Pub/Sub Subscription Push Latency High monitor"
  type        = list(string)
  default     = []
}
|
||||
|
||||
#
|
||||
# subscription_push_latency_anomaly
|
||||
#
|
||||
# Inputs of the "push latency anomaly" monitor (disabled by default).
# Defaults of string-typed variables are written as strings so the literal
# matches the declared type (Terraform converted the numbers implicitly).

# Compared against the string "true" to decide whether the monitor is created.
variable "subscription_push_latency_anomaly_enabled" {
  description = "Flag to enable GCP Pub/Sub Subscription Push Latency Anomaly monitor"
  type        = string
  default     = "false"
}

# Empty string means "use the module-wide message".
variable "subscription_push_latency_anomaly_message" {
  description = "Custom message for the GCP Pub/Sub Subscription Push Latency Anomaly monitor"
  type        = string
  default     = ""
}

variable "subscription_push_latency_anomaly_time_aggregator" {
  description = "Time aggregator for the GCP Pub/Sub Subscription Push Latency Anomaly monitor"
  type        = string
  default     = "avg"
}

variable "subscription_push_latency_anomaly_timeframe" {
  description = "Timeframe for the GCP Pub/Sub Subscription Push Latency Anomaly monitor"
  type        = string
  default     = "last_10m"
}

# Interpolated, quoted, as the algorithm argument of anomalies().
variable "subscription_push_latency_anomaly_detection_algorithm" {
  description = "Anomaly Detection Algorithm used"
  type        = string
  default     = "basic"
}

# Interpolated as direction='...' in the anomalies() call.
variable "subscription_push_latency_anomaly_direction" {
  description = "Direction of the anomaly. It can be both, below or above."
  type        = string
  default     = "above"
}

# Interpolated as alert_window='...' in the anomalies() call.
variable "subscription_push_latency_anomaly_alert_window" {
  description = "Alert window."
  type        = string
  default     = "last_15m"
}

# Interpolated unquoted as interval=... in the anomalies() call.
variable "subscription_push_latency_anomaly_interval" {
  description = "Interval."
  type        = string
  default     = "60"
}

# Interpolated as count_default_zero='...' in the anomalies() call.
variable "subscription_push_latency_anomaly_count_default_zero" {
  description = "Count default zero."
  type        = string
  default     = "true"
}

# Optionally interpolated as seasonality='...' in the anomalies() call.
variable "subscription_push_latency_anomaly_seasonality" {
  description = "Seasonality of the algorithm"
  type        = string
  default     = "daily"
}

variable "subscription_push_latency_anomaly_threshold_warning" {
  description = "GCP Pub/Sub Subscription Push Latency Anomaly warning threshold"
  type        = string
  default     = "1"
}

variable "subscription_push_latency_anomaly_threshold_critical" {
  description = "GCP Pub/Sub Subscription Push Latency Anomaly critical threshold"
  type        = string
  default     = "2"
}

# Appended to the monitor's built-in tag list.
variable "subscription_push_latency_anomaly_extra_tags" {
  description = "Extra tags for GCP Pub/Sub Subscription Push Latency Anomaly monitor"
  type        = list(string)
  default     = []
}
|
||||
|
||||
|
||||
|
||||
128
cloud/gcp/pubsub/subscription/monitors-subscription.tf
Normal file
128
cloud/gcp/pubsub/subscription/monitors-subscription.tf
Normal file
@ -0,0 +1,128 @@
|
||||
######################
|
||||
# All Subscriptions #
|
||||
######################
|
||||
|
||||
#
|
||||
# oldest_unacked_message_age
|
||||
#
|
||||
# Query alert on the age of the oldest unacknowledged message, evaluated per
# subscription_id. Triggers when the aggregated value is at or above the
# critical threshold.
resource "datadog_monitor" "oldest_unacked_message_age" {
  # The module toggles monitors with the string "true" rather than a bool.
  count   = var.oldest_unacked_message_age_enabled == "true" ? 1 : 0
  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] GCP Pub/Sub Subscription Oldest unacked message is {{#is_alert}}{{{comparator}}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ({{value}}){{/is_warning}}s old"
  # coalesce() falls back to the module-wide message when no custom one is set.
  message = coalesce(var.oldest_unacked_message_age_message, var.message)
  type    = "query alert"

  query = <<EOQ
${var.oldest_unacked_message_age_time_aggregator}(${var.oldest_unacked_message_age_timeframe}):
avg:gcp.pubsub.subscription.oldest_unacked_message_age{${var.filter_tags}} by {subscription_id}
>= ${var.oldest_unacked_message_age_threshold_critical}
EOQ

  thresholds = {
    warning  = var.oldest_unacked_message_age_threshold_warning
    critical = var.oldest_unacked_message_age_threshold_critical
  }

  evaluation_delay    = var.evaluation_delay
  new_host_delay      = var.new_host_delay
  notify_audit        = false
  locked              = false
  timeout_h           = 0
  include_tags        = true
  require_full_window = false
  notify_no_data      = false
  renotify_interval   = 0

  tags = concat(["env:${var.environment}", "type:cloud", "provider:gcp", "resource:pubsub", "pubsub_category:subscription", "team:claranet", "created-by:terraform"], var.oldest_unacked_message_age_extra_tags)

  lifecycle {
    # Bare attribute reference: the quoted form is deprecated since Terraform 0.12.
    ignore_changes = [silenced]
  }
}
|
||||
|
||||
######################
|
||||
# Push Subscriptions #
|
||||
######################
|
||||
|
||||
#
|
||||
# subscription_push_latency
|
||||
#
|
||||
# Query alert on the average push-request latency, evaluated per
# subscription_id. Triggers when the aggregated value is at or above the
# critical threshold.
resource "datadog_monitor" "subscription_push_latency" {
  # The module toggles monitors with the string "true" rather than a bool.
  count   = var.subscription_push_latency_enabled == "true" ? 1 : 0
  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] GCP Pub/Sub Subscription Latency on push endpoint {{#is_alert}}{{{comparator}}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ({{value}}){{/is_warning}}"
  # coalesce() falls back to the module-wide message when no custom one is set.
  message = coalesce(var.subscription_push_latency_message, var.message)
  type    = "query alert"

  query = <<EOQ
${var.subscription_push_latency_time_aggregator}(${var.subscription_push_latency_timeframe}):
avg:gcp.pubsub.subscription.push_request_latencies.avg{${var.filter_tags}} by {subscription_id}
>= ${var.subscription_push_latency_threshold_critical}
EOQ

  thresholds = {
    warning  = var.subscription_push_latency_threshold_warning
    critical = var.subscription_push_latency_threshold_critical
  }

  evaluation_delay    = var.evaluation_delay
  new_host_delay      = var.new_host_delay
  notify_audit        = false
  locked              = false
  timeout_h           = 0
  include_tags        = true
  require_full_window = false
  notify_no_data      = false
  renotify_interval   = 0

  tags = concat(["env:${var.environment}", "type:cloud", "provider:gcp", "resource:pubsub", "pubsub_category:subscription", "team:claranet", "created-by:terraform"], var.subscription_push_latency_extra_tags)

  lifecycle {
    # Bare attribute reference: the quoted form is deprecated since Terraform 0.12.
    ignore_changes = [silenced]
  }
}
|
||||
|
||||
#
|
||||
# subscription_push_latency_anomaly
|
||||
#
|
||||
# Anomaly-detection query alert on the average push-request latency,
# evaluated per subscription_id. Created only when the *_enabled flag is the
# string "true" (the module default is "false").
#
# Fixes applied to the query:
#   * the metric argument of anomalies() is now followed by a comma, so the
#     algorithm is parsed as a separate argument;
#   * the optional seasonality clause is emitted based on the detection
#     algorithm (agile/robust) instead of comparing the seasonality value
#     itself to "agile", which could never match a real seasonality.
#
# NOTE(review): the third anomalies() argument is a metric query
# (push_request_latencies.sumsqdev). Datadog documents that position as the
# numeric "bounds" (deviations) value - confirm and replace with a number or a
# dedicated variable.
resource "datadog_monitor" "subscription_push_latency_anomaly" {
  count   = var.subscription_push_latency_anomaly_enabled == "true" ? 1 : 0
  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] GCP Pub/Sub Subscription Anomaly latency on push endpoint {{#is_alert}}{{{comparator}}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ({{value}}){{/is_warning}}"
  # coalesce() falls back to the module-wide message when no custom one is set.
  message = coalesce(var.subscription_push_latency_anomaly_message, var.message)
  type    = "query alert"

  query = <<EOQ
${var.subscription_push_latency_anomaly_time_aggregator}(${var.subscription_push_latency_anomaly_timeframe}):
anomalies(
avg:gcp.pubsub.subscription.push_request_latencies.avg{${var.filter_tags}} by {subscription_id},
'${var.subscription_push_latency_anomaly_detection_algorithm}',
avg: gcp.pubsub.subscription.push_request_latencies.sumsqdev{${var.filter_tags}} by {subscription_id},
direction='${var.subscription_push_latency_anomaly_direction}',
alert_window='${var.subscription_push_latency_anomaly_alert_window}',
interval=${var.subscription_push_latency_anomaly_interval},
count_default_zero='${var.subscription_push_latency_anomaly_count_default_zero}'
${contains(["agile", "robust"], var.subscription_push_latency_anomaly_detection_algorithm) ? format(",seasonality='%s'", var.subscription_push_latency_anomaly_seasonality) : ""}
)

>= ${var.subscription_push_latency_anomaly_threshold_critical}
EOQ

  thresholds = {
    warning  = var.subscription_push_latency_anomaly_threshold_warning
    critical = var.subscription_push_latency_anomaly_threshold_critical
  }

  evaluation_delay    = var.evaluation_delay
  new_host_delay      = var.new_host_delay
  notify_audit        = false
  locked              = false
  timeout_h           = 0
  include_tags        = true
  require_full_window = false
  notify_no_data      = false
  renotify_interval   = 0

  tags = concat(["env:${var.environment}", "type:cloud", "provider:gcp", "resource:pubsub", "pubsub_category:subscription", "team:claranet", "created-by:terraform"], var.subscription_push_latency_anomaly_extra_tags)

  lifecycle {
    # Bare attribute reference: the quoted form is deprecated since Terraform 0.12.
    ignore_changes = [silenced]
  }
}
|
||||
15
cloud/gcp/pubsub/subscription/outputs.tf
Normal file
15
cloud/gcp/pubsub/subscription/outputs.tf
Normal file
@ -0,0 +1,15 @@
|
||||
# List of monitor ids; the resource uses count, so the list is empty when
# the monitor is disabled.
output "oldest_unacked_message_age_id" {
  value       = datadog_monitor.oldest_unacked_message_age[*].id
  description = "id for monitor oldest_unacked_message_age"
}
|
||||
|
||||
# List of monitor ids; the resource uses count, so the list is empty when
# the monitor is disabled.
output "subscription_push_latency_id" {
  value       = datadog_monitor.subscription_push_latency[*].id
  description = "id for monitor subscription_push_latency"
}
|
||||
|
||||
# List of monitor ids; the resource uses count, so the list is empty when
# the monitor is disabled.
output "subscription_push_latency_anomaly_id" {
  value       = datadog_monitor.subscription_push_latency_anomaly[*].id
  description = "id for monitor subscription_push_latency_anomaly"
}
|
||||
|
||||
@ -1,10 +1,10 @@
|
||||
# CLOUD GCP PUBSUB DataDog monitors
|
||||
# CLOUD GCP PUBSUB TOPIC DataDog monitors
|
||||
|
||||
## How to use this module
|
||||
|
||||
```
|
||||
module "datadog-monitors-cloud-gcp-pubsub" {
|
||||
source = "git::ssh://git@git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors.git//cloud/gcp/pubsub?ref={revision}"
|
||||
module "datadog-monitors-cloud-gcp-pubsub-topic" {
|
||||
source = "git::ssh://git@git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors.git//cloud/gcp/pubsub/topic?ref={revision}"
|
||||
|
||||
environment = var.environment
|
||||
message = module.datadog-message-alerting.alerting-message
|
||||
@ -16,8 +16,9 @@ module "datadog-monitors-cloud-gcp-pubsub" {
|
||||
|
||||
Creates DataDog monitors with the following checks:
|
||||
|
||||
- GCP pubsub sending messages operations
|
||||
- GCP pubsub sending messages with result unavailable
|
||||
- GCP Pub/Sub Topic ratio of sending messages with result unavailable
|
||||
- GCP Pub/Sub Topic sending messages operations
|
||||
- GCP Pub/Sub Topic sending messages with result unavailable
|
||||
|
||||
## Inputs
|
||||
|
||||
@ -42,6 +43,13 @@ Creates DataDog monitors with the following checks:
|
||||
| unavailable\_sending\_operations\_count\_threshold\_warning | Warning threshold for the number of unavailable sending operations | string | `"2"` | no |
|
||||
| unavailable\_sending\_operations\_count\_time\_aggregator | Time aggregator for the GCP Pub/Sub Unavailable Sending Operations Count monitor | string | `"sum"` | no |
|
||||
| unavailable\_sending\_operations\_count\_timeframe | Timeframe for the GCP Pub/Sub Unavailable Sending Operations Count monitor | string | `"last_10m"` | no |
|
||||
| unavailable\_sending\_operations\_ratio\_enabled | Flag to enable GCP Pub/Sub Unavailable Sending Operations Ratio monitor | string | `"true"` | no |
|
||||
| unavailable\_sending\_operations\_ratio\_extra\_tags | Extra tags for GCP Pub/Sub Unavailable Sending Operations Ratio monitor | list(string) | `[]` | no |
|
||||
| unavailable\_sending\_operations\_ratio\_message | Custom message for the GCP Pub/Sub Unavailable Sending Operations Ratio monitor | string | `""` | no |
|
||||
| unavailable\_sending\_operations\_ratio\_threshold\_critical | Critical threshold (%) for the ratio of unavailable sending operations | string | `"20"` | no |
|
||||
| unavailable\_sending\_operations\_ratio\_threshold\_warning | Warning threshold (%) for the ratio of unavailable sending operations | string | `"10"` | no |
|
||||
| unavailable\_sending\_operations\_ratio\_time\_aggregator | Time aggregator for the GCP Pub/Sub Unavailable Sending Operations Ratio monitor | string | `"sum"` | no |
|
||||
| unavailable\_sending\_operations\_ratio\_timeframe | Timeframe for the GCP Pub/Sub Unavailable Sending Operations Ratio monitor | string | `"last_10m"` | no |
|
||||
|
||||
## Outputs
|
||||
|
||||
@ -49,6 +57,7 @@ Creates DataDog monitors with the following checks:
|
||||
|------|-------------|
|
||||
| sending\_operations\_count\_id | id for monitor sending_operations_count |
|
||||
| unavailable\_sending\_operations\_count\_id | id for monitor unavailable_sending_operations_count |
|
||||
| unavailable\_sending\_operations\_ratio\_id | id for monitor unavailable_sending_operations_ratio |
|
||||
|
||||
## Related documentation
|
||||
|
||||
@ -114,3 +114,47 @@ variable "unavailable_sending_operations_count_extra_tags" {
|
||||
default = []
|
||||
}
|
||||
|
||||
#
|
||||
# Unavailable Sending Operations Ratio
|
||||
#
|
||||
# Inputs of the "unavailable sending operations ratio" monitor.
# Defaults of string-typed variables are written as strings so the literal
# matches the declared type (Terraform converted the numbers implicitly).

# Empty string means "use the module-wide message".
variable "unavailable_sending_operations_ratio_message" {
  description = "Custom message for the GCP Pub/Sub Unavailable Sending Operations Ratio monitor"
  type        = string
  default     = ""
}

# Description fixed: this input is the time aggregator, not the timeframe.
variable "unavailable_sending_operations_ratio_time_aggregator" {
  description = "Time aggregator for the GCP Pub/Sub Unavailable Sending Operations Ratio monitor"
  type        = string
  default     = "sum"
}

variable "unavailable_sending_operations_ratio_timeframe" {
  description = "Timeframe for the GCP Pub/Sub Unavailable Sending Operations Ratio monitor"
  type        = string
  default     = "last_10m"
}

variable "unavailable_sending_operations_ratio_threshold_warning" {
  description = "Warning threshold (%) for the ratio of unavailable sending operations"
  type        = string
  default     = "10"
}

variable "unavailable_sending_operations_ratio_threshold_critical" {
  description = "Critical threshold (%) for the ratio of unavailable sending operations"
  type        = string
  default     = "20"
}

# Compared against the string "true" to decide whether the monitor is created.
variable "unavailable_sending_operations_ratio_enabled" {
  description = "Flag to enable GCP Pub/Sub Unavailable Sending Operations Ratio monitor"
  type        = string
  default     = "true"
}

# Appended to the monitor's built-in tag list.
variable "unavailable_sending_operations_ratio_extra_tags" {
  description = "Extra tags for GCP Pub/Sub Unavailable Sending Operations Ratio monitor"
  type        = list(string)
  default     = []
}
|
||||
111
cloud/gcp/pubsub/topic/monitors-topics.tf
Normal file
111
cloud/gcp/pubsub/topic/monitors-topics.tf
Normal file
@ -0,0 +1,111 @@
|
||||
#
|
||||
# Sending Operations Count
|
||||
#
|
||||
# Query alert on the number of Pub/Sub "send message" operations per topic.
# Triggers when the aggregated count is at or below the critical threshold;
# missing series are treated as 0 by default(), and the monitor also notifies
# when no data is received.
resource "datadog_monitor" "sending_operations_count" {
  # The module toggles monitors with the string "true" rather than a bool.
  count   = var.sending_operations_count_enabled == "true" ? 1 : 0
  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] GCP Pub/Sub Topic sending messages operations {{#is_alert}}{{{comparator}}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ({{value}}){{/is_warning}}"
  # coalesce() falls back to the module-wide message when no custom one is set.
  message = coalesce(var.sending_operations_count_message, var.message)
  type    = "query alert"

  query = <<EOQ
${var.sending_operations_count_time_aggregator}(${var.sending_operations_count_timeframe}):
default(avg:gcp.pubsub.topic.send_message_operation_count{${var.filter_tags}} by {topic_id}.as_count(), 0)
<= ${var.sending_operations_count_threshold_critical}
EOQ

  thresholds = {
    critical = var.sending_operations_count_threshold_critical
  }

  evaluation_delay    = var.evaluation_delay
  new_host_delay      = var.new_host_delay
  notify_audit        = false
  locked              = false
  timeout_h           = 0
  include_tags        = true
  require_full_window = false
  notify_no_data      = true
  renotify_interval   = 0

  tags = concat(["env:${var.environment}", "type:cloud", "provider:gcp", "resource:pubsub", "pubsub_category:topic", "team:claranet", "created-by:terraform"], var.sending_operations_count_extra_tags)

  lifecycle {
    # Bare attribute reference: the quoted form is deprecated since Terraform 0.12.
    ignore_changes = [silenced]
  }
}
|
||||
|
||||
#
|
||||
# Unavailable Sending Operations Count
|
||||
#
|
||||
# Query alert on the number of Pub/Sub "send message" operations per topic
# whose response code is "unavailable". Triggers when the aggregated count is
# at or above the critical threshold; missing series are treated as 0.
resource "datadog_monitor" "unavailable_sending_operations_count" {
  # The module toggles monitors with the string "true" rather than a bool.
  count   = var.unavailable_sending_operations_count_enabled == "true" ? 1 : 0
  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] GCP Pub/Sub Topic sending messages with result unavailable {{#is_alert}}{{{comparator}}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ({{value}}){{/is_warning}}"
  # coalesce() falls back to the module-wide message when no custom one is set.
  message = coalesce(var.unavailable_sending_operations_count_message, var.message)
  type    = "query alert"

  query = <<EOQ
${var.unavailable_sending_operations_count_time_aggregator}(${var.unavailable_sending_operations_count_timeframe}):
default(avg:gcp.pubsub.topic.send_message_operation_count{${var.filter_tags},response_code:unavailable} by {topic_id}.as_count(), 0)
>= ${var.unavailable_sending_operations_count_threshold_critical}
EOQ

  thresholds = {
    warning  = var.unavailable_sending_operations_count_threshold_warning
    critical = var.unavailable_sending_operations_count_threshold_critical
  }

  evaluation_delay    = var.evaluation_delay
  new_host_delay      = var.new_host_delay
  notify_audit        = false
  locked              = false
  timeout_h           = 0
  include_tags        = true
  require_full_window = false
  notify_no_data      = false
  renotify_interval   = 0

  tags = concat(["env:${var.environment}", "type:cloud", "provider:gcp", "resource:pubsub", "pubsub_category:topic", "team:claranet", "created-by:terraform"], var.unavailable_sending_operations_count_extra_tags)

  lifecycle {
    # Bare attribute reference: the quoted form is deprecated since Terraform 0.12.
    ignore_changes = [silenced]
  }
}
|
||||
|
||||
#
|
||||
# Unavailable Sending Operations Ratio
|
||||
#
|
||||
# Query alert on the percentage of Pub/Sub "send message" operations per topic
# that returned "unavailable": 100 * unavailable_count / total_count.
# Triggers when the result is at or above the critical threshold.
#
# NOTE(review): both operands are wrapped in default(..., 0), so the
# denominator is 0 when a topic reports no operations - confirm how the
# monitor evaluates those points.
resource "datadog_monitor" "unavailable_sending_operations_ratio" {
  # The module toggles monitors with the string "true" rather than a bool.
  count   = var.unavailable_sending_operations_ratio_enabled == "true" ? 1 : 0
  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] GCP Pub/Sub Topic ratio of sending messages with result unavailable {{#is_alert}}{{{comparator}}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ({{value}}){{/is_warning}}"
  # coalesce() falls back to the module-wide message when no custom one is set.
  message = coalesce(var.unavailable_sending_operations_ratio_message, var.message)
  type    = "query alert"

  query = <<EOQ
${var.unavailable_sending_operations_ratio_time_aggregator}(${var.unavailable_sending_operations_ratio_timeframe}):
(100 * default(avg:gcp.pubsub.topic.send_message_operation_count{${var.filter_tags},response_code:unavailable} by {topic_id}.as_count(), 0))
/
default(avg:gcp.pubsub.topic.send_message_operation_count{${var.filter_tags}} by {topic_id}.as_count(), 0)
>= ${var.unavailable_sending_operations_ratio_threshold_critical}
EOQ

  thresholds = {
    warning  = var.unavailable_sending_operations_ratio_threshold_warning
    critical = var.unavailable_sending_operations_ratio_threshold_critical
  }

  evaluation_delay    = var.evaluation_delay
  new_host_delay      = var.new_host_delay
  notify_audit        = false
  locked              = false
  timeout_h           = 0
  include_tags        = true
  require_full_window = false
  notify_no_data      = false
  renotify_interval   = 0

  tags = concat(["env:${var.environment}", "type:cloud", "provider:gcp", "resource:pubsub", "pubsub_category:topic", "team:claranet", "created-by:terraform"], var.unavailable_sending_operations_ratio_extra_tags)

  lifecycle {
    # Bare attribute reference: the quoted form is deprecated since Terraform 0.12.
    ignore_changes = [silenced]
  }
}
|
||||
@ -8,3 +8,8 @@ output "unavailable_sending_operations_count_id" {
|
||||
value = datadog_monitor.unavailable_sending_operations_count.*.id
|
||||
}
|
||||
|
||||
# List of monitor ids; the resource uses count, so the list is empty when
# the monitor is disabled.
output "unavailable_sending_operations_ratio_id" {
  value       = datadog_monitor.unavailable_sending_operations_ratio[*].id
  description = "id for monitor unavailable_sending_operations_ratio"
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user