MON-46 Add default on all queries. Remove no data from invocations. Improve naming.

This commit is contained in:
Rafael Romero Carmona 2019-09-06 11:34:46 +01:00
parent 4d39f320f7
commit f2cf760628
3 changed files with 18 additions and 18 deletions

View File

@ -16,9 +16,9 @@ module "datadog-monitors-cloud-aws-lambda" {
Creates DataDog monitors with the following checks: Creates DataDog monitors with the following checks:
- Lambda Number of Errors - Lambda Invocations throttled due to concurrent limit reached
- Lambda Number of Invocations (disabled by default) - Lambda Number of errors
- Lambda Number of Throttles - Lambda Number of invocations (disabled by default)
- Lambda Percentage of errors - Lambda Percentage of errors
## Inputs ## Inputs
@ -44,7 +44,7 @@ Creates DataDog monitors with the following checks:
| invocations\_threshold\_critical | Alerting threshold in number of invocations | string | `"1"` | no | | invocations\_threshold\_critical | Alerting threshold in number of invocations | string | `"1"` | no |
| invocations\_threshold\_warning | Warning threshold in number of invocations | string | `"2"` | no | | invocations\_threshold\_warning | Warning threshold in number of invocations | string | `"2"` | no |
| invocations\_time\_aggregator | Monitor aggregator for Invocations [available values: min, max, sum or avg] | string | `"sum"` | no | | invocations\_time\_aggregator | Monitor aggregator for Invocations [available values: min, max, sum or avg] | string | `"sum"` | no |
| invocations\_timeframe | Monitor timeframe for Invocations [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_1h"` | no | | invocations\_timeframe | Monitor timeframe for Invocations [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_30m"` | no |
| message | Message sent when a monitor is triggered | string | n/a | yes | | message | Message sent when a monitor is triggered | string | n/a | yes |
| new\_host\_delay | Delay in seconds before monitor new resource | string | `"300"` | no | | new\_host\_delay | Delay in seconds before monitor new resource | string | `"300"` | no |
| pct\_errors\_enabled | Flag to enable Percentage of errors monitor | string | `"true"` | no | | pct\_errors\_enabled | Flag to enable Percentage of errors monitor | string | `"true"` | no |

View File

@ -192,7 +192,7 @@ variable "invocations_time_aggregator" {
variable "invocations_timeframe" { variable "invocations_timeframe" {
description = "Monitor timeframe for Invocations [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" description = "Monitor timeframe for Invocations [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
type = string type = string
default = "last_1h" default = "last_30m"
} }
variable "invocations_threshold_critical" { variable "invocations_threshold_critical" {

View File

@ -7,10 +7,11 @@ resource "datadog_monitor" "pct_errors" {
query = <<EOQ query = <<EOQ
${var.pct_errors_time_aggregator}(${var.pct_errors_timeframe}): ${var.pct_errors_time_aggregator}(${var.pct_errors_timeframe}):
(sum:aws.lambda.errors${module.filter-tags.query_alert} by {region,functionname}.as_count() default(
/ (default(sum:aws.lambda.errors${module.filter-tags.query_alert} by {region,functionname}.as_count(),0)
sum:aws.lambda.invocations${module.filter-tags.query_alert} by {region,functionname}.as_count()) /
* 100 default(sum:aws.lambda.invocations${module.filter-tags.query_alert} by {region,functionname}.as_count(),1))
* 100,0)
> ${var.pct_errors_threshold_critical} > ${var.pct_errors_threshold_critical}
EOQ EOQ
@ -40,13 +41,13 @@ resource "datadog_monitor" "pct_errors" {
# Errors Absolute Value # Errors Absolute Value
resource "datadog_monitor" "errors" { resource "datadog_monitor" "errors" {
count = var.errors_enabled == "true" ? 1 : 0 count = var.errors_enabled == "true" ? 1 : 0
name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Lambda Number of Errors {{#is_alert}}{{{comparator}}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ({{value}}){{/is_warning}}" name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Lambda Number of errors {{#is_alert}}{{{comparator}}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ({{value}}){{/is_warning}}"
type = "metric alert" type = "metric alert"
message = coalesce(var.errors_message, var.message) message = coalesce(var.errors_message, var.message)
query = <<EOQ query = <<EOQ
${var.errors_time_aggregator}(${var.errors_timeframe}): ${var.errors_time_aggregator}(${var.errors_timeframe}):
sum:aws.lambda.errors${module.filter-tags.query_alert} by {region,functionname}.as_count() default(sum:aws.lambda.errors${module.filter-tags.query_alert} by {region,functionname}.as_count(),0)
> ${var.errors_threshold_critical} > ${var.errors_threshold_critical}
EOQ EOQ
@ -76,13 +77,13 @@ resource "datadog_monitor" "errors" {
# Throttles # Throttles
resource "datadog_monitor" "throttles" { resource "datadog_monitor" "throttles" {
count = var.throttles_enabled == "true" ? 1 : 0 count = var.throttles_enabled == "true" ? 1 : 0
name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Lambda Number of Throttles {{#is_alert}}{{{comparator}}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ({{value}}){{/is_warning}}" name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Lambda Invocations throttled due to concurrent limit reached {{#is_alert}}{{{comparator}}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ({{value}}){{/is_warning}}"
type = "metric alert" type = "metric alert"
message = coalesce(var.throttles_message, var.message) message = coalesce(var.throttles_message, var.message)
query = <<EOQ query = <<EOQ
${var.throttles_time_aggregator}(${var.throttles_timeframe}): ${var.throttles_time_aggregator}(${var.throttles_timeframe}):
sum:aws.lambda.throttles${module.filter-tags.query_alert} by {region,functionname}.as_count() default(sum:aws.lambda.throttles${module.filter-tags.query_alert} by {region,functionname}.as_count(),0)
> ${var.throttles_threshold_critical} > ${var.throttles_threshold_critical}
EOQ EOQ
@ -112,14 +113,14 @@ resource "datadog_monitor" "throttles" {
# INVOCATIONS # INVOCATIONS
resource "datadog_monitor" "invocations" { resource "datadog_monitor" "invocations" {
count = var.invocations_enabled == "true" ? 1 : 0 count = var.invocations_enabled == "true" ? 1 : 0
name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Lambda Number of Invocations {{#is_alert}}{{{comparator}}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ({{value}}){{/is_warning}}" name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Lambda Number of invocations {{#is_alert}}{{{comparator}}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ({{value}}){{/is_warning}}"
type = "metric alert" type = "metric alert"
message = coalesce(var.invocations_message, var.message) message = coalesce(var.invocations_message, var.message)
query = <<EOQ query = <<EOQ
${var.invocations_time_aggregator}(${var.invocations_timeframe}): ${var.invocations_time_aggregator}(${var.invocations_timeframe}):
sum:aws.lambda.invocations${module.filter-tags.query_alert} by {region,functionname}.as_count() default(sum:aws.lambda.invocations${module.filter-tags.query_alert} by {region,functionname}.as_count(),0)
< ${var.invocations_threshold_critical} <= ${var.invocations_threshold_critical}
EOQ EOQ
evaluation_delay = var.evaluation_delay evaluation_delay = var.evaluation_delay
@ -130,8 +131,7 @@ resource "datadog_monitor" "invocations" {
warning = var.invocations_threshold_warning warning = var.invocations_threshold_warning
} }
notify_no_data = true notify_no_data = false
no_data_timeframe = var.invocations_no_data_timeframe
require_full_window = false require_full_window = false
renotify_interval = 0 renotify_interval = 0
notify_audit = false notify_audit = false