MON-46 Monitors on number of errors, percentage of errors, number of invocations and number of throttles. No data enabled for the one on invocations.
This commit is contained in:
parent
2b8124fa4f
commit
a0f0c43776
211
cloud/aws/lambda/inputs.tf
Normal file
211
cloud/aws/lambda/inputs.tf
Normal file
@ -0,0 +1,211 @@
|
|||||||
|
# Datadog global variables
|
||||||
|
|
||||||
|
# Environment name: forwarded to the filter-tags module and interpolated into
# every monitor name and into the "env:" tag.
variable "environment" {
  type        = string
  description = "Architecture environment"
}
|
||||||
|
|
||||||
|
# Passed through unchanged to the filter-tags module.
# Note the default is the string "true", not a boolean.
variable "filter_tags_use_defaults" {
  default     = "true"
  description = "Use default filter tags convention"
}
|
||||||
|
|
||||||
|
# Passed through unchanged to the filter-tags module.
variable "filter_tags_custom" {
  default     = "*"
  description = "Tags used for custom filtering when filter_tags_use_defaults is false"
}
|
||||||
|
|
||||||
|
# Passed through unchanged to the filter-tags module; empty by default.
variable "filter_tags_custom_excluded" {
  default     = ""
  description = "Tags excluded for custom filtering when filter_tags_use_defaults is false"
}
|
||||||
|
|
||||||
|
# Module-wide notification message (required: no default).
# Each monitor falls back to it through coalesce() when its own
# <monitor>_message variable is left empty.
variable "message" {
  description = "Message sent when a monitor is triggered"
}
|
||||||
|
|
||||||
|
# Shared by all four Lambda monitors as their evaluation_delay.
variable "evaluation_delay" {
  default     = 900
  description = "Delay in seconds for the metric evaluation"
}
|
||||||
|
|
||||||
|
# Shared by all four Lambda monitors as their new_host_delay.
variable "new_host_delay" {
  description = "Delay in seconds before monitoring a new resource"
  default     = 300
}
|
||||||
|
|
||||||
|
# Optional name prefix. When non-empty, monitor names start with "[<prefix_slug>]";
# when empty (the default) nothing is prepended.
variable "prefix_slug" {
  description = "Prefix string to prepend between brackets on every monitor name"
  default     = ""
}
|
||||||
|
|
||||||
|
# Datadog monitors variables
|
||||||
|
|
||||||
|
# Percentage of errors
|
||||||
|
# String flag: the pct_errors monitor is created only when this equals "true".
variable "pct_errors_enabled" {
  type        = string
  default     = "true"
  description = "Flag to enable Percentage of errors monitor"
}
|
||||||
|
|
||||||
|
# Appended (via concat) to the fixed tag list of the pct_errors monitor.
variable "pct_errors_extra_tags" {
  type        = list(string)
  default     = []
  description = "Extra tags for Percentage of errors monitor"
}
|
||||||
|
|
||||||
|
# Monitor-specific message; when left empty the monitor uses var.message instead.
variable "pct_errors_message" {
  type        = string
  default     = ""
  description = "Custom message for Percentage of errors monitor"
}
|
||||||
|
|
||||||
|
# Time aggregation function placed at the head of the pct_errors monitor query.
# The description now lists "sum", which is the default used here.
variable "pct_errors_time_aggregator" {
  description = "Monitor aggregator for Percentage of errors [available values: min, max, sum or avg]"
  type        = string
  default     = "sum"
}
|
||||||
|
|
||||||
|
# Evaluation window interpolated into the pct_errors monitor query.
variable "pct_errors_timeframe" {
  type        = string
  default     = "last_1h"
  description = "Monitor timeframe for Percentage of errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
}
|
||||||
|
|
||||||
|
# Used both in the query comparison ("> value") and as thresholds.critical
# of the pct_errors monitor.
variable "pct_errors_threshold_critical" {
  description = "Alerting threshold in percentage"
  default     = 30
}
|
||||||
|
|
||||||
|
# Used as thresholds.warning of the pct_errors monitor.
variable "pct_errors_threshold_warning" {
  description = "Warning threshold in percentage"
  default     = 20
}
|
||||||
|
|
||||||
|
# Errors count
|
||||||
|
# String flag: the errors monitor is created only when this equals "true".
# Disabled by default.
variable "errors_enabled" {
  type        = string
  default     = "false"
  description = "Flag to enable Errors monitor"
}
|
||||||
|
|
||||||
|
# Appended (via concat) to the fixed tag list of the errors monitor.
variable "errors_extra_tags" {
  type        = list(string)
  default     = []
  description = "Extra tags for Errors monitor"
}
|
||||||
|
|
||||||
|
# Monitor-specific message; when left empty the monitor uses var.message instead.
variable "errors_message" {
  type        = string
  default     = ""
  description = "Custom message for Errors monitor"
}
|
||||||
|
|
||||||
|
# Time aggregation function placed at the head of the errors monitor query.
# The description now lists "sum", which is the default used here.
variable "errors_time_aggregator" {
  description = "Monitor aggregator for Errors [available values: min, max, sum or avg]"
  type        = string
  default     = "sum"
}
|
||||||
|
|
||||||
|
# Evaluation window interpolated into the errors monitor query.
variable "errors_timeframe" {
  type        = string
  default     = "last_1h"
  description = "Monitor timeframe for Errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
}
|
||||||
|
|
||||||
|
# Used both in the query comparison ("> value") and as thresholds.critical
# of the errors monitor. The monitored metric is an error count
# (aws.lambda.errors as_count), not a duration.
variable "errors_threshold_critical" {
  description = "Alerting threshold in number of errors"
  default     = 3
}
|
||||||
|
|
||||||
|
# Used as thresholds.warning of the errors monitor. The monitored metric is
# an error count (aws.lambda.errors as_count), not a duration.
variable "errors_threshold_warning" {
  description = "Warning threshold in number of errors"
  default     = 1
}
|
||||||
|
|
||||||
|
# Throttles count
|
||||||
|
# String flag: the throttles monitor is created only when this equals "true".
variable "throttles_enabled" {
  type        = string
  default     = "true"
  description = "Flag to enable Throttles monitor"
}
|
||||||
|
|
||||||
|
# Appended (via concat) to the fixed tag list of the throttles monitor.
variable "throttles_extra_tags" {
  type        = list(string)
  default     = []
  description = "Extra tags for Throttles monitor"
}
|
||||||
|
|
||||||
|
# Monitor-specific message; when left empty the monitor uses var.message instead.
variable "throttles_message" {
  type        = string
  default     = ""
  description = "Custom message for Throttles monitor"
}
|
||||||
|
|
||||||
|
# Time aggregation function placed at the head of the throttles monitor query.
# The description now lists "sum", which is the default used here.
variable "throttles_time_aggregator" {
  description = "Monitor aggregator for Throttles [available values: min, max, sum or avg]"
  type        = string
  default     = "sum"
}
|
||||||
|
|
||||||
|
# Evaluation window interpolated into the throttles monitor query.
variable "throttles_timeframe" {
  type        = string
  default     = "last_1h"
  description = "Monitor timeframe for Throttles [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
}
|
||||||
|
|
||||||
|
# Used both in the query comparison ("> value") and as thresholds.critical
# of the throttles monitor.
variable "throttles_threshold_critical" {
  description = "Alerting threshold in number of throttles"
  default     = 3
}
|
||||||
|
|
||||||
|
# Used as thresholds.warning of the throttles monitor.
variable "throttles_threshold_warning" {
  description = "Warning threshold in number of throttles"
  default     = 1
}
|
||||||
|
|
||||||
|
# Invocations
|
||||||
|
# String flag: the invocations monitor is created only when this equals "true".
# Disabled by default.
variable "invocations_enabled" {
  type        = string
  default     = "false"
  description = "Flag to enable Invocations monitor"
}
|
||||||
|
|
||||||
|
# Appended (via concat) to the fixed tag list of the invocations monitor.
variable "invocations_extra_tags" {
  type        = list(string)
  default     = []
  description = "Extra tags for Invocations monitor"
}
|
||||||
|
|
||||||
|
# Monitor-specific message; when left empty the monitor uses var.message instead.
variable "invocations_message" {
  type        = string
  default     = ""
  description = "Custom message for Invocations monitor"
}
|
||||||
|
|
||||||
|
# Time aggregation function placed at the head of the invocations monitor query.
# The description now lists "sum", which is the default used here.
variable "invocations_time_aggregator" {
  description = "Monitor aggregator for Invocations [available values: min, max, sum or avg]"
  type        = string
  default     = "sum"
}
|
||||||
|
|
||||||
|
# Evaluation window interpolated into the invocations monitor query.
variable "invocations_timeframe" {
  type        = string
  default     = "last_1h"
  description = "Monitor timeframe for Invocations [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
}
|
||||||
|
|
||||||
|
# Used both in the query comparison and as thresholds.critical of the
# invocations monitor. That query alerts with "<" (too few invocations),
# so the critical value is the lower of the two thresholds.
variable "invocations_threshold_critical" {
  description = "Alerting threshold in number of invocations"
  default     = 1
}
|
||||||
|
|
||||||
|
# Used as thresholds.warning of the invocations monitor. That monitor alerts
# with "<", so the warning value sits above the critical one.
variable "invocations_threshold_warning" {
  description = "Warning threshold in number of invocations"
  default     = 2
}
|
||||||
|
|
||||||
|
# Used as no_data_timeframe of the invocations monitor, the only monitor in
# this module with notify_no_data enabled.
variable "invocations_no_data_timeframe" {
  description = "Timeframe to check before alerting on no data in minutes"
  default     = 120
}
|
||||||
9
cloud/aws/lambda/modules.tf
Normal file
9
cloud/aws/lambda/modules.tf
Normal file
@ -0,0 +1,9 @@
|
|||||||
|
# Shared filter-tags helper module. Its query_alert output is interpolated
# into every monitor query of this module.
module "filter-tags" {
  source = "../../../common/filter-tags"

  # Fixed resource identifier for this module.
  resource = "aws_lambda"

  # Caller-supplied settings, forwarded unchanged.
  environment                 = var.environment
  filter_tags_custom          = var.filter_tags_custom
  filter_tags_custom_excluded = var.filter_tags_custom_excluded
  filter_tags_use_defaults    = var.filter_tags_use_defaults
}
|
||||||
149
cloud/aws/lambda/monitors-lambda.tf
Normal file
149
cloud/aws/lambda/monitors-lambda.tf
Normal file
@ -0,0 +1,149 @@
|
|||||||
|
# Errors Percent
|
||||||
|
# Alerts when the ratio of Lambda errors to invocations (in percent), per
# region and function name, rises above the critical threshold.
resource "datadog_monitor" "pct_errors" {
  # Created only when the flag is exactly the string "true".
  count = var.pct_errors_enabled == "true" ? 1 : 0
  name  = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Lambda Percentage of errors {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
  type  = "metric alert"

  # The monitor-specific message wins; an empty one falls back to var.message.
  message = coalesce(var.pct_errors_message, var.message)

  # Heredoc content is part of the query string and is kept verbatim.
  query = <<EOQ
${var.pct_errors_time_aggregator}(${var.pct_errors_timeframe}):
(sum:aws.lambda.errors${module.filter-tags.query_alert} by {region,functionname}.as_count()
/
sum:aws.lambda.invocations${module.filter-tags.query_alert} by {region,functionname}.as_count())
* 100
> ${var.pct_errors_threshold_critical}
EOQ

  evaluation_delay = var.evaluation_delay
  new_host_delay   = var.new_host_delay

  thresholds = {
    critical = var.pct_errors_threshold_critical
    warning  = var.pct_errors_threshold_warning
  }

  notify_no_data      = false
  require_full_window = false
  renotify_interval   = 0
  notify_audit        = false
  timeout_h           = 0
  include_tags        = true
  locked              = false

  tags = concat(["env:${var.environment}", "type:cloud", "provider:aws", "resource:lambda", "team:claranet", "created-by:terraform"], var.pct_errors_extra_tags)

  lifecycle {
    # Unquoted attribute reference: the quoted form is deprecated in Terraform 0.12+.
    ignore_changes = [silenced]
  }
}
|
||||||
|
|
||||||
|
# Errors Absolute Value
|
||||||
|
# Alerts when the number of Lambda errors, per region and function name,
# rises above the critical threshold.
resource "datadog_monitor" "errors" {
  # Created only when the flag is exactly the string "true".
  count = var.errors_enabled == "true" ? 1 : 0
  name  = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Lambda Number of Errors {{#is_alert}}{{{comparator}}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ({{value}}){{/is_warning}}"
  type  = "metric alert"

  # The monitor-specific message wins; an empty one falls back to var.message.
  message = coalesce(var.errors_message, var.message)

  # Heredoc content is part of the query string and is kept verbatim.
  query = <<EOQ
${var.errors_time_aggregator}(${var.errors_timeframe}):
sum:aws.lambda.errors${module.filter-tags.query_alert} by {region,functionname}.as_count()
> ${var.errors_threshold_critical}
EOQ

  evaluation_delay = var.evaluation_delay
  new_host_delay   = var.new_host_delay

  thresholds = {
    critical = var.errors_threshold_critical
    warning  = var.errors_threshold_warning
  }

  notify_no_data      = false
  require_full_window = false
  renotify_interval   = 0
  notify_audit        = false
  timeout_h           = 0
  include_tags        = true
  locked              = false

  tags = concat(["env:${var.environment}", "type:cloud", "provider:aws", "resource:lambda", "team:claranet", "created-by:terraform"], var.errors_extra_tags)

  lifecycle {
    # Unquoted attribute reference: the quoted form is deprecated in Terraform 0.12+.
    ignore_changes = [silenced]
  }
}
|
||||||
|
|
||||||
|
# Throttles
|
||||||
|
# Alerts when the number of Lambda throttles, per region and function name,
# rises above the critical threshold.
resource "datadog_monitor" "throttles" {
  # Created only when the flag is exactly the string "true".
  count = var.throttles_enabled == "true" ? 1 : 0
  name  = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Lambda Number of Throttles {{#is_alert}}{{{comparator}}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ({{value}}){{/is_warning}}"
  type  = "metric alert"

  # The monitor-specific message wins; an empty one falls back to var.message.
  message = coalesce(var.throttles_message, var.message)

  # Heredoc content is part of the query string and is kept verbatim.
  query = <<EOQ
${var.throttles_time_aggregator}(${var.throttles_timeframe}):
sum:aws.lambda.throttles${module.filter-tags.query_alert} by {region,functionname}.as_count()
> ${var.throttles_threshold_critical}
EOQ

  evaluation_delay = var.evaluation_delay
  new_host_delay   = var.new_host_delay

  thresholds = {
    critical = var.throttles_threshold_critical
    warning  = var.throttles_threshold_warning
  }

  notify_no_data      = false
  require_full_window = false
  renotify_interval   = 0
  notify_audit        = false
  timeout_h           = 0
  include_tags        = true
  locked              = false

  tags = concat(["env:${var.environment}", "type:cloud", "provider:aws", "resource:lambda", "team:claranet", "created-by:terraform"], var.throttles_extra_tags)

  lifecycle {
    # Unquoted attribute reference: the quoted form is deprecated in Terraform 0.12+.
    ignore_changes = [silenced]
  }
}
|
||||||
|
|
||||||
|
# Invocations
|
||||||
|
# Alerts when the number of Lambda invocations, per region and function name,
# drops below the critical threshold, and also when no data is received.
resource "datadog_monitor" "invocations" {
  # Created only when the flag is exactly the string "true".
  count = var.invocations_enabled == "true" ? 1 : 0
  name  = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Lambda Number of Invocations {{#is_alert}}{{{comparator}}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ({{value}}){{/is_warning}}"
  type  = "metric alert"

  # The monitor-specific message wins; an empty one falls back to var.message.
  message = coalesce(var.invocations_message, var.message)

  # Unlike the other monitors, this one compares with "<" (too few invocations).
  # Heredoc content is part of the query string and is kept verbatim.
  query = <<EOQ
${var.invocations_time_aggregator}(${var.invocations_timeframe}):
sum:aws.lambda.invocations${module.filter-tags.query_alert} by {region,functionname}.as_count()
< ${var.invocations_threshold_critical}
EOQ

  evaluation_delay = var.evaluation_delay
  new_host_delay   = var.new_host_delay

  thresholds = {
    critical = var.invocations_threshold_critical
    warning  = var.invocations_threshold_warning
  }

  # Only monitor in this module that notifies on missing data.
  notify_no_data      = true
  no_data_timeframe   = var.invocations_no_data_timeframe
  require_full_window = false
  renotify_interval   = 0
  notify_audit        = false
  timeout_h           = 0
  include_tags        = true
  locked              = false

  tags = concat(["env:${var.environment}", "type:cloud", "provider:aws", "resource:lambda", "team:claranet", "created-by:terraform"], var.invocations_extra_tags)

  lifecycle {
    # Unquoted attribute reference: the quoted form is deprecated in Terraform 0.12+.
    ignore_changes = [silenced]
  }
}
|
||||||
20
cloud/aws/lambda/outputs.tf
Normal file
20
cloud/aws/lambda/outputs.tf
Normal file
@ -0,0 +1,20 @@
|
|||||||
|
# List of ids because the monitor is declared with count; empty when the
# errors monitor is disabled.
output "errors_id" {
  value       = datadog_monitor.errors[*].id
  description = "id for monitor errors"
}
|
||||||
|
|
||||||
|
# List of ids because the monitor is declared with count; empty when the
# invocations monitor is disabled.
output "invocations_id" {
  value       = datadog_monitor.invocations[*].id
  description = "id for monitor invocations"
}
|
||||||
|
|
||||||
|
# List of ids because the monitor is declared with count; empty when the
# pct_errors monitor is disabled.
output "pct_errors_id" {
  value       = datadog_monitor.pct_errors[*].id
  description = "id for monitor pct_errors"
}
|
||||||
|
|
||||||
|
# List of ids because the monitor is declared with count; empty when the
# throttles monitor is disabled.
output "throttles_id" {
  value       = datadog_monitor.throttles[*].id
  description = "id for monitor throttles"
}
|
||||||
|
|
||||||
Loading…
x
Reference in New Issue
Block a user