MON-46 Monitors on number of errors, percentage of errors, number of invocations and number of throttles. No-data notification is enabled for the one on invocations.
This commit is contained in:
parent
2b8124fa4f
commit
a0f0c43776
211
cloud/aws/lambda/inputs.tf
Normal file
211
cloud/aws/lambda/inputs.tf
Normal file
@ -0,0 +1,211 @@
|
||||
# Datadog global variables
# Inputs shared by every Lambda monitor declared in this module.

# Environment name; used in monitor names, in the "env:" tag and by the filter-tags module.
variable "environment" {
  type        = string
  description = "Architecture environment"
}

# Forwarded as-is to the filter-tags module.
variable "filter_tags_use_defaults" {
  default     = "true"
  description = "Use default filter tags convention"
}

# Forwarded as-is to the filter-tags module.
variable "filter_tags_custom" {
  default     = "*"
  description = "Tags used for custom filtering when filter_tags_use_defaults is false"
}

# Forwarded as-is to the filter-tags module.
variable "filter_tags_custom_excluded" {
  default     = ""
  description = "Tags excluded for custom filtering when filter_tags_use_defaults is false"
}

# Required (no default). Used as the fallback when a monitor's own *_message is empty.
variable "message" {
  description = "Message sent when a monitor is triggered"
}

# Applied to every monitor as evaluation_delay.
variable "evaluation_delay" {
  default     = 900
  description = "Delay in seconds for the metric evaluation"
}

# Applied to every monitor as new_host_delay.
variable "new_host_delay" {
  default     = 300
  description = "Delay in seconds before monitor new resource"
}

# When non-empty, rendered as "[<prefix_slug>]" in front of each monitor name.
variable "prefix_slug" {
  default     = ""
  description = "Prefix string to prepend between brackets on every monitors names"
}
|
||||
|
||||
# Datadog monitors variables
|
||||
|
||||
# Percentage of errors
# Inputs for the datadog_monitor.pct_errors resource.

# String flag: the monitor is created only when this equals "true".
variable "pct_errors_enabled" {
  description = "Flag to enable Percentage of errors monitor"
  type        = string
  default     = "true"
}

# Appended to the monitor's built-in tag list.
variable "pct_errors_extra_tags" {
  description = "Extra tags for Percentage of errors monitor"
  type        = list(string)
  default     = []
}

# An empty string means "fall back to var.message".
variable "pct_errors_message" {
  description = "Custom message for Percentage of errors monitor"
  type        = string
  default     = ""
}

# The default is "sum", so the list of accepted values must mention it.
variable "pct_errors_time_aggregator" {
  description = "Monitor aggregator for Percentage of errors [available values: min, max, avg or sum]"
  type        = string
  default     = "sum"
}

variable "pct_errors_timeframe" {
  description = "Monitor timeframe for Percentage of errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
  type        = string
  default     = "last_1h"
}

# The monitor alerts when the percentage is strictly greater than this value.
variable "pct_errors_threshold_critical" {
  description = "Alerting threshold in percentage"
  default     = 30
}

variable "pct_errors_threshold_warning" {
  description = "Warning threshold in percentage"
  default     = 20
}
|
||||
|
||||
# Errors count
# Inputs for the datadog_monitor.errors resource (disabled by default).

# String flag: the monitor is created only when this equals "true".
variable "errors_enabled" {
  description = "Flag to enable Errors monitor"
  type        = string
  default     = "false"
}

# Appended to the monitor's built-in tag list.
variable "errors_extra_tags" {
  description = "Extra tags for Errors monitor"
  type        = list(string)
  default     = []
}

# An empty string means "fall back to var.message".
variable "errors_message" {
  description = "Custom message for Errors monitor"
  type        = string
  default     = ""
}

# The default is "sum", so the list of accepted values must mention it.
variable "errors_time_aggregator" {
  description = "Monitor aggregator for Errors [available values: min, max, avg or sum]"
  type        = string
  default     = "sum"
}

variable "errors_timeframe" {
  description = "Monitor timeframe for Errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
  type        = string
  default     = "last_1h"
}

# The monitored metric is an error count (aws.lambda.errors as_count), not a duration.
variable "errors_threshold_critical" {
  description = "Alerting threshold in number of errors"
  default     = 3
}

variable "errors_threshold_warning" {
  description = "Warning threshold in number of errors"
  default     = 1
}
|
||||
|
||||
# Throttles count
# Inputs for the datadog_monitor.throttles resource.

# String flag: the monitor is created only when this equals "true".
variable "throttles_enabled" {
  description = "Flag to enable Throttles monitor"
  type        = string
  default     = "true"
}

# Appended to the monitor's built-in tag list.
variable "throttles_extra_tags" {
  description = "Extra tags for Throttles monitor"
  type        = list(string)
  default     = []
}

# An empty string means "fall back to var.message".
variable "throttles_message" {
  description = "Custom message for Throttles monitor"
  type        = string
  default     = ""
}

# The default is "sum", so the list of accepted values must mention it.
variable "throttles_time_aggregator" {
  description = "Monitor aggregator for Throttles [available values: min, max, avg or sum]"
  type        = string
  default     = "sum"
}

variable "throttles_timeframe" {
  description = "Monitor timeframe for Throttles [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
  type        = string
  default     = "last_1h"
}

# The monitor alerts when the throttle count is strictly greater than this value.
variable "throttles_threshold_critical" {
  description = "Alerting threshold in number of throttles"
  default     = 3
}

variable "throttles_threshold_warning" {
  description = "Warning threshold in number of throttles"
  default     = 1
}
|
||||
|
||||
# Invocations
# Inputs for the datadog_monitor.invocations resource (disabled by default).

# String flag: the monitor is created only when this equals "true".
variable "invocations_enabled" {
  description = "Flag to enable Invocations monitor"
  type        = string
  default     = "false"
}

# Appended to the monitor's built-in tag list.
variable "invocations_extra_tags" {
  description = "Extra tags for Invocations monitor"
  type        = list(string)
  default     = []
}

# An empty string means "fall back to var.message".
variable "invocations_message" {
  description = "Custom message for Invocations monitor"
  type        = string
  default     = ""
}

# The default is "sum", so the list of accepted values must mention it.
variable "invocations_time_aggregator" {
  description = "Monitor aggregator for Invocations [available values: min, max, avg or sum]"
  type        = string
  default     = "sum"
}

variable "invocations_timeframe" {
  description = "Monitor timeframe for Invocations [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
  type        = string
  default     = "last_1h"
}

# This monitor alerts when invocations drop BELOW the threshold ("<" in the query),
# so the warning threshold must be higher than the critical one.
variable "invocations_threshold_critical" {
  description = "Alerting threshold in number of invocations"
  default     = 1
}

variable "invocations_threshold_warning" {
  description = "Warning threshold in number of invocations"
  default     = 2
}

# Only the invocations monitor notifies on missing data; this is its no_data_timeframe.
variable "invocations_no_data_timeframe" {
  description = "Timeframe to check before alerting on no data in minutes"
  default     = 120
}
|
||||
9
cloud/aws/lambda/modules.tf
Normal file
9
cloud/aws/lambda/modules.tf
Normal file
@ -0,0 +1,9 @@
|
||||
# Shared filter-tags helper: the monitors below append its query_alert output
# to each metric name in their queries.
module "filter-tags" {
  source = "../../../common/filter-tags"

  # Fixed resource identifier for this module.
  resource    = "aws_lambda"
  environment = var.environment

  # Tag-filtering inputs are passed through unchanged.
  filter_tags_use_defaults    = var.filter_tags_use_defaults
  filter_tags_custom          = var.filter_tags_custom
  filter_tags_custom_excluded = var.filter_tags_custom_excluded
}
|
||||
149
cloud/aws/lambda/monitors-lambda.tf
Normal file
149
cloud/aws/lambda/monitors-lambda.tf
Normal file
@ -0,0 +1,149 @@
|
||||
# Errors Percent
# Metric alert on (errors / invocations) * 100, grouped by region and function name.
# Triggers when the value is strictly greater than the critical threshold.
resource "datadog_monitor" "pct_errors" {
  # Created only when the enable flag is the string "true".
  count = var.pct_errors_enabled == "true" ? 1 : 0

  name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Lambda Percentage of errors {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
  type = "metric alert"

  # coalesce() skips empty strings, so the shared message is used when no custom one is set.
  message = coalesce(var.pct_errors_message, var.message)

  # "<<-" strips the common leading indentation, so the rendered query starts at column 0.
  query = <<-EOQ
  ${var.pct_errors_time_aggregator}(${var.pct_errors_timeframe}):
  (sum:aws.lambda.errors${module.filter-tags.query_alert} by {region,functionname}.as_count()
  /
  sum:aws.lambda.invocations${module.filter-tags.query_alert} by {region,functionname}.as_count())
  * 100
  > ${var.pct_errors_threshold_critical}
  EOQ

  evaluation_delay = var.evaluation_delay
  new_host_delay   = var.new_host_delay

  thresholds = {
    critical = var.pct_errors_threshold_critical
    warning  = var.pct_errors_threshold_warning
  }

  notify_no_data      = false
  require_full_window = false
  renotify_interval   = 0
  notify_audit        = false
  timeout_h           = 0
  include_tags        = true
  locked              = false

  tags = concat(["env:${var.environment}", "type:cloud", "provider:aws", "resource:lambda", "team:claranet", "created-by:terraform"], var.pct_errors_extra_tags)

  lifecycle {
    # Unquoted attribute reference: the quoted form ["silenced"] is deprecated.
    ignore_changes = [silenced]
  }
}
|
||||
|
||||
# Errors Absolute Value
# Metric alert on the raw count of Lambda errors, grouped by region and function name.
# Triggers when the count is strictly greater than the critical threshold.
resource "datadog_monitor" "errors" {
  # Created only when the enable flag is the string "true".
  count = var.errors_enabled == "true" ? 1 : 0

  name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Lambda Number of Errors {{#is_alert}}{{{comparator}}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ({{value}}){{/is_warning}}"
  type = "metric alert"

  # coalesce() skips empty strings, so the shared message is used when no custom one is set.
  message = coalesce(var.errors_message, var.message)

  # "<<-" strips the common leading indentation, so the rendered query starts at column 0.
  query = <<-EOQ
  ${var.errors_time_aggregator}(${var.errors_timeframe}):
  sum:aws.lambda.errors${module.filter-tags.query_alert} by {region,functionname}.as_count()
  > ${var.errors_threshold_critical}
  EOQ

  evaluation_delay = var.evaluation_delay
  new_host_delay   = var.new_host_delay

  thresholds = {
    critical = var.errors_threshold_critical
    warning  = var.errors_threshold_warning
  }

  notify_no_data      = false
  require_full_window = false
  renotify_interval   = 0
  notify_audit        = false
  timeout_h           = 0
  include_tags        = true
  locked              = false

  tags = concat(["env:${var.environment}", "type:cloud", "provider:aws", "resource:lambda", "team:claranet", "created-by:terraform"], var.errors_extra_tags)

  lifecycle {
    # Unquoted attribute reference: the quoted form ["silenced"] is deprecated.
    ignore_changes = [silenced]
  }
}
|
||||
|
||||
# Throttles
# Metric alert on the count of Lambda throttles, grouped by region and function name.
# Triggers when the count is strictly greater than the critical threshold.
resource "datadog_monitor" "throttles" {
  # Created only when the enable flag is the string "true".
  count = var.throttles_enabled == "true" ? 1 : 0

  name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Lambda Number of Throttles {{#is_alert}}{{{comparator}}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ({{value}}){{/is_warning}}"
  type = "metric alert"

  # coalesce() skips empty strings, so the shared message is used when no custom one is set.
  message = coalesce(var.throttles_message, var.message)

  # "<<-" strips the common leading indentation, so the rendered query starts at column 0.
  query = <<-EOQ
  ${var.throttles_time_aggregator}(${var.throttles_timeframe}):
  sum:aws.lambda.throttles${module.filter-tags.query_alert} by {region,functionname}.as_count()
  > ${var.throttles_threshold_critical}
  EOQ

  evaluation_delay = var.evaluation_delay
  new_host_delay   = var.new_host_delay

  thresholds = {
    critical = var.throttles_threshold_critical
    warning  = var.throttles_threshold_warning
  }

  notify_no_data      = false
  require_full_window = false
  renotify_interval   = 0
  notify_audit        = false
  timeout_h           = 0
  include_tags        = true
  locked              = false

  tags = concat(["env:${var.environment}", "type:cloud", "provider:aws", "resource:lambda", "team:claranet", "created-by:terraform"], var.throttles_extra_tags)

  lifecycle {
    # Unquoted attribute reference: the quoted form ["silenced"] is deprecated.
    ignore_changes = [silenced]
  }
}
|
||||
|
||||
# Invocations
# Metric alert on the count of Lambda invocations, grouped by region and function name.
# Unlike the other monitors it triggers when the count falls strictly BELOW the
# critical threshold, and it also notifies when no data is received.
resource "datadog_monitor" "invocations" {
  # Created only when the enable flag is the string "true".
  count = var.invocations_enabled == "true" ? 1 : 0

  name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Lambda Number of Invocations {{#is_alert}}{{{comparator}}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ({{value}}){{/is_warning}}"
  type = "metric alert"

  # coalesce() skips empty strings, so the shared message is used when no custom one is set.
  message = coalesce(var.invocations_message, var.message)

  # "<<-" strips the common leading indentation, so the rendered query starts at column 0.
  query = <<-EOQ
  ${var.invocations_time_aggregator}(${var.invocations_timeframe}):
  sum:aws.lambda.invocations${module.filter-tags.query_alert} by {region,functionname}.as_count()
  < ${var.invocations_threshold_critical}
  EOQ

  evaluation_delay = var.evaluation_delay
  new_host_delay   = var.new_host_delay

  thresholds = {
    critical = var.invocations_threshold_critical
    warning  = var.invocations_threshold_warning
  }

  # Missing data is itself an alert condition for this monitor.
  notify_no_data      = true
  no_data_timeframe   = var.invocations_no_data_timeframe
  require_full_window = false
  renotify_interval   = 0
  notify_audit        = false
  timeout_h           = 0
  include_tags        = true
  locked              = false

  tags = concat(["env:${var.environment}", "type:cloud", "provider:aws", "resource:lambda", "team:claranet", "created-by:terraform"], var.invocations_extra_tags)

  lifecycle {
    # Unquoted attribute reference: the quoted form ["silenced"] is deprecated.
    ignore_changes = [silenced]
  }
}
|
||||
20
cloud/aws/lambda/outputs.tf
Normal file
20
cloud/aws/lambda/outputs.tf
Normal file
@ -0,0 +1,20 @@
|
||||
# List of monitor ids; empty when the errors monitor is disabled (count = 0).
output "errors_id" {
  value       = datadog_monitor.errors[*].id
  description = "id for monitor errors"
}
|
||||
|
||||
# List of monitor ids; empty when the invocations monitor is disabled (count = 0).
output "invocations_id" {
  value       = datadog_monitor.invocations[*].id
  description = "id for monitor invocations"
}
|
||||
|
||||
# List of monitor ids; empty when the pct_errors monitor is disabled (count = 0).
output "pct_errors_id" {
  value       = datadog_monitor.pct_errors[*].id
  description = "id for monitor pct_errors"
}
|
||||
|
||||
# List of monitor ids; empty when the throttles monitor is disabled (count = 0).
output "throttles_id" {
  value       = datadog_monitor.throttles[*].id
  description = "id for monitor throttles"
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user