Merge branch 'MON-46-aws-lambda-tf012' into 'master'

MON-46 AWS Lambda

See merge request claranet/pt-monitoring/projects/datadog/terraform/monitors!109
This commit is contained in:
Quentin Manfroi 2019-09-06 17:37:29 +02:00
commit 326474a374
6 changed files with 467 additions and 0 deletions

View File

@ -147,6 +147,7 @@ The `//` is very important, it's a terraform specific syntax used to separate gi
- [elasticsearch](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/aws/elasticsearch/)
- [elb](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/aws/elb/)
- [kinesis-firehose](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/aws/kinesis-firehose/)
- [lambda](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/aws/lambda/)
- [rds](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/aws/rds/)
- [aurora](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/aws/rds/aurora/)
- [mysql](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/aws/rds/aurora/mysql/)

View File

@ -0,0 +1,77 @@
# CLOUD AWS LAMBDA DataDog monitors
## How to use this module
```
module "datadog-monitors-cloud-aws-lambda" {
source = "git::ssh://git@git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors.git//cloud/aws/lambda?ref={revision}"
environment = var.environment
message = module.datadog-message-alerting.alerting-message
}
```
## Purpose
Creates DataDog monitors with the following checks:
- Lambda Invocations throttled due to concurrent limit reached
- Lambda Number of errors (disabled by default)
- Lambda Number of invocations (disabled by default)
- Lambda Percentage of errors
## Inputs
| Name | Description | Type | Default | Required |
|------|-------------|:----:|:-----:|:-----:|
| environment | Architecture environment | string | n/a | yes |
| errors\_enabled | Flag to enable Errors monitor | string | `"false"` | no |
| errors\_extra\_tags | Extra tags for Errors monitor | list(string) | `[]` | no |
| errors\_message | Custom message for Errors monitor | string | `""` | no |
| errors\_threshold\_critical | Alerting threshold in number of errors | string | `"3"` | no |
| errors\_threshold\_warning | Warning threshold in number of errors | string | `"1"` | no |
| errors\_time\_aggregator | Monitor aggregator for Errors [available values: min, max, sum or avg] | string | `"sum"` | no |
| errors\_timeframe | Monitor timeframe for Errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_1h"` | no |
| evaluation\_delay | Delay in seconds for the metric evaluation | string | `"900"` | no |
| filter\_tags\_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `"*"` | no |
| filter\_tags\_custom\_excluded | Tags excluded for custom filtering when filter_tags_use_defaults is false | string | `""` | no |
| filter\_tags\_use\_defaults | Use default filter tags convention | string | `"true"` | no |
| invocations\_enabled | Flag to enable Invocations monitor | string | `"false"` | no |
| invocations\_extra\_tags | Extra tags for Invocations monitor | list(string) | `[]` | no |
| invocations\_message | Custom message for Invocations monitor | string | `""` | no |
| invocations\_no\_data\_timeframe | Timeframe to check before alerting on no data in minutes | string | `"120"` | no |
| invocations\_threshold\_critical | Alerting threshold in number of invocations | string | `"1"` | no |
| invocations\_threshold\_warning | Warning threshold in number of invocations | string | `"2"` | no |
| invocations\_time\_aggregator | Monitor aggregator for Invocations [available values: min, max, sum or avg] | string | `"sum"` | no |
| invocations\_timeframe | Monitor timeframe for Invocations [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_30m"` | no |
| message | Message sent when a monitor is triggered | string | n/a | yes |
| new\_host\_delay | Delay in seconds before monitor new resource | string | `"300"` | no |
| pct\_errors\_enabled | Flag to enable Percentage of errors monitor | string | `"true"` | no |
| pct\_errors\_extra\_tags | Extra tags for Percentage of errors monitor | list(string) | `[]` | no |
| pct\_errors\_message | Custom message for Percentage of errors monitor | string | `""` | no |
| pct\_errors\_threshold\_critical | Alerting threshold in percentage | string | `"30"` | no |
| pct\_errors\_threshold\_warning | Warning threshold in percentage | string | `"20"` | no |
| pct\_errors\_time\_aggregator | Monitor aggregator for Percentage of errors [available values: min, max, sum or avg] | string | `"sum"` | no |
| pct\_errors\_timeframe | Monitor timeframe for Percentage of errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_1h"` | no |
| prefix\_slug | Prefix string to prepend between brackets on every monitors names | string | `""` | no |
| throttles\_enabled | Flag to enable Throttles monitor | string | `"true"` | no |
| throttles\_extra\_tags | Extra tags for Throttles monitor | list(string) | `[]` | no |
| throttles\_message | Custom message for Throttles monitor | string | `""` | no |
| throttles\_threshold\_critical | Alerting threshold in number of throttles | string | `"3"` | no |
| throttles\_threshold\_warning | Warning threshold in number of throttles | string | `"1"` | no |
| throttles\_time\_aggregator | Monitor aggregator for Throttles [available values: min, max, sum or avg] | string | `"sum"` | no |
| throttles\_timeframe | Monitor timeframe for Throttles [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_1h"` | no |
## Outputs
| Name | Description |
|------|-------------|
| errors\_id | id for monitor errors |
| invocations\_id | id for monitor invocations |
| pct\_errors\_id | id for monitor pct_errors |
| throttles\_id | id for monitor throttles |
## Related documentation
* [Datadog Documentation](https://docs.datadoghq.com/integrations/amazon_lambda/)
* [Service documentation](https://docs.aws.amazon.com/lambda/index.html)

211
cloud/aws/lambda/inputs.tf Normal file
View File

@ -0,0 +1,211 @@
# Datadog global variables
# Required. Passed to the filter-tags module and interpolated into every
# monitor name and `env:` tag of this module.
variable "environment" {
  type        = string
  description = "Architecture environment"
}
# Forwarded unchanged to the filter-tags module.
variable "filter_tags_use_defaults" {
  default     = "true"
  description = "Use default filter tags convention"
}
# Forwarded unchanged to the filter-tags module.
variable "filter_tags_custom" {
  default     = "*"
  description = "Tags used for custom filtering when filter_tags_use_defaults is false"
}
# Forwarded unchanged to the filter-tags module.
variable "filter_tags_custom_excluded" {
  default     = ""
  description = "Tags excluded for custom filtering when filter_tags_use_defaults is false"
}
# Required. Used as the fallback notification text by each monitor when its
# own `*_message` variable is left empty.
variable "message" {
  description = "Message sent when a monitor is triggered"
}
# Shared by all monitors of this module (set as their `evaluation_delay`).
variable "evaluation_delay" {
  default     = 900
  description = "Delay in seconds for the metric evaluation"
}
# Shared by all monitors of this module (set as their `new_host_delay`).
variable "new_host_delay" {
  default     = 300
  description = "Delay in seconds before monitor new resource"
}
# When non-empty, the monitors render it as "[<prefix_slug>]" at the very
# start of their name; the empty default adds nothing.
variable "prefix_slug" {
  default     = ""
  description = "Prefix string to prepend between brackets on every monitors names"
}
# Datadog monitors variables
# Percentage of errors
# String flag: the monitor is created only when the value is exactly "true".
variable "pct_errors_enabled" {
  type        = string
  default     = "true"
  description = "Flag to enable Percentage of errors monitor"
}
# Appended to the fixed tag list of the Percentage of errors monitor.
variable "pct_errors_extra_tags" {
  type        = list(string)
  default     = []
  description = "Extra tags for Percentage of errors monitor"
}
# Optional override; when left empty the monitor falls back to `var.message`.
variable "pct_errors_message" {
  type        = string
  default     = ""
  description = "Custom message for Percentage of errors monitor"
}
# Time aggregation function placed at the start of the monitor query.
# The description previously omitted `sum` even though it is the default.
variable "pct_errors_time_aggregator" {
  description = "Monitor aggregator for Percentage of errors [available values: min, max, sum or avg]"
  type        = string
  default     = "sum"
}
# Evaluation window placed in the monitor query next to the time aggregator.
variable "pct_errors_timeframe" {
  type        = string
  default     = "last_1h"
  description = "Monitor timeframe for Percentage of errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
}
# Used both in the monitor query comparison and as its critical threshold.
variable "pct_errors_threshold_critical" {
  description = "Alerting threshold in percentage"
  default     = 30
}
# Warning threshold of the Percentage of errors monitor.
variable "pct_errors_threshold_warning" {
  description = "Warning threshold in percentage"
  default     = 20
}
# Errors count
# String flag: the monitor is created only when the value is exactly "true".
# Disabled by default.
variable "errors_enabled" {
  type        = string
  default     = "false"
  description = "Flag to enable Errors monitor"
}
# Appended to the fixed tag list of the Errors monitor.
variable "errors_extra_tags" {
  type        = list(string)
  default     = []
  description = "Extra tags for Errors monitor"
}
# Optional override; when left empty the monitor falls back to `var.message`.
variable "errors_message" {
  type        = string
  default     = ""
  description = "Custom message for Errors monitor"
}
# Time aggregation function placed at the start of the monitor query.
# The description previously omitted `sum` even though it is the default.
variable "errors_time_aggregator" {
  description = "Monitor aggregator for Errors [available values: min, max, sum or avg]"
  type        = string
  default     = "sum"
}
# Evaluation window placed in the monitor query next to the time aggregator.
variable "errors_timeframe" {
  type        = string
  default     = "last_1h"
  description = "Monitor timeframe for Errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
}
# Critical threshold for the Errors monitor. The value is compared against
# the aws.lambda.errors count, so the unit is a number of errors (the
# description previously said "milliseconds").
variable "errors_threshold_critical" {
  default     = 3
  description = "Alerting threshold in number of errors"
}
# Warning threshold for the Errors monitor. The monitor evaluates the
# aws.lambda.errors count, so the unit is a number of errors (the
# description previously said "milliseconds").
variable "errors_threshold_warning" {
  default     = 1
  description = "Warning threshold in number of errors"
}
# Throttles count
# String flag: the monitor is created only when the value is exactly "true".
variable "throttles_enabled" {
  type        = string
  default     = "true"
  description = "Flag to enable Throttles monitor"
}
# Appended to the fixed tag list of the Throttles monitor.
variable "throttles_extra_tags" {
  type        = list(string)
  default     = []
  description = "Extra tags for Throttles monitor"
}
# Optional override; when left empty the monitor falls back to `var.message`.
variable "throttles_message" {
  type        = string
  default     = ""
  description = "Custom message for Throttles monitor"
}
# Time aggregation function placed at the start of the monitor query.
# The description previously omitted `sum` even though it is the default.
variable "throttles_time_aggregator" {
  description = "Monitor aggregator for Throttles [available values: min, max, sum or avg]"
  type        = string
  default     = "sum"
}
# Evaluation window placed in the monitor query next to the time aggregator.
variable "throttles_timeframe" {
  type        = string
  default     = "last_1h"
  description = "Monitor timeframe for Throttles [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
}
# Used both in the monitor query comparison and as its critical threshold.
variable "throttles_threshold_critical" {
  description = "Alerting threshold in number of throttles"
  default     = 3
}
# Warning threshold of the Throttles monitor.
variable "throttles_threshold_warning" {
  description = "Warning threshold in number of throttles"
  default     = 1
}
# Invocations
# String flag: the monitor is created only when the value is exactly "true".
# Disabled by default.
variable "invocations_enabled" {
  type        = string
  default     = "false"
  description = "Flag to enable Invocations monitor"
}
# Appended to the fixed tag list of the Invocations monitor.
variable "invocations_extra_tags" {
  type        = list(string)
  default     = []
  description = "Extra tags for Invocations monitor"
}
# Optional override; when left empty the monitor falls back to `var.message`.
variable "invocations_message" {
  type        = string
  default     = ""
  description = "Custom message for Invocations monitor"
}
# Time aggregation function placed at the start of the monitor query.
# The description previously omitted `sum` even though it is the default.
variable "invocations_time_aggregator" {
  description = "Monitor aggregator for Invocations [available values: min, max, sum or avg]"
  type        = string
  default     = "sum"
}
# Evaluation window placed in the monitor query next to the time aggregator.
variable "invocations_timeframe" {
  type        = string
  default     = "last_30m"
  description = "Monitor timeframe for Invocations [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
}
# The Invocations monitor alerts when the count is at or BELOW this value
# (its query uses `<=`), so it sits under the warning threshold.
variable "invocations_threshold_critical" {
  description = "Alerting threshold in number of invocations"
  default     = 1
}
# Warning threshold of the Invocations monitor; higher than the critical
# default because that monitor's query compares with `<=`.
variable "invocations_threshold_warning" {
  description = "Warning threshold in number of invocations"
  default     = 2
}
# NOTE(review): no monitor in this module appears to reference this variable
# (the invocations monitor sets notify_no_data = false) — confirm whether it
# should be wired in or removed.
variable "invocations_no_data_timeframe" {
  description = "Timeframe to check before alerting on no data in minutes"
  default     = 120
}

View File

@ -0,0 +1,9 @@
# Shared helper module; its `query_alert` output is spliced into every
# monitor query of this module as the tag filter.
module "filter-tags" {
  source = "../../../common/filter-tags"

  resource    = "aws_lambda"
  environment = var.environment

  filter_tags_custom          = var.filter_tags_custom
  filter_tags_custom_excluded = var.filter_tags_custom_excluded
  filter_tags_use_defaults    = var.filter_tags_use_defaults
}

View File

@ -0,0 +1,149 @@
# Errors Percent
# Alerts on errors / invocations * 100, grouped by region and function name.
# Created only when var.pct_errors_enabled is exactly "true".
resource "datadog_monitor" "pct_errors" {
  count = var.pct_errors_enabled == "true" ? 1 : 0
  # Optional "[prefix_slug]" followed by "[environment]" and the monitor title.
  name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Lambda Percentage of errors {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
  type = "metric alert"
  # The monitor-specific message wins; an empty one falls back to var.message.
  message = coalesce(var.pct_errors_message, var.message)

  # Missing errors default to 0 and missing invocations default to 1, so the
  # division is never by an absent series; the whole ratio also defaults to 0.
  query = <<EOQ
${var.pct_errors_time_aggregator}(${var.pct_errors_timeframe}):
default(
(default(sum:aws.lambda.errors${module.filter-tags.query_alert} by {region,functionname}.as_count(),0)
/
default(sum:aws.lambda.invocations${module.filter-tags.query_alert} by {region,functionname}.as_count(),1))
* 100,0)
> ${var.pct_errors_threshold_critical}
EOQ

  evaluation_delay = var.evaluation_delay
  new_host_delay   = var.new_host_delay

  thresholds = {
    critical = var.pct_errors_threshold_critical
    warning  = var.pct_errors_threshold_warning
  }

  notify_no_data      = false
  require_full_window = false
  renotify_interval   = 0
  notify_audit        = false
  timeout_h           = 0
  include_tags        = true
  locked              = false

  tags = concat(["env:${var.environment}", "type:cloud", "provider:aws", "resource:lambda", "team:claranet", "created-by:terraform"], var.pct_errors_extra_tags)

  lifecycle {
    # Bare attribute reference: the quoted form ("silenced") is deprecated
    # since Terraform 0.12. Changes to `silenced` are not reverted by Terraform.
    ignore_changes = [silenced]
  }
}
# Errors Absolute Value
# Alerts on the number of Lambda errors, grouped by region and function name.
# Created only when var.errors_enabled is exactly "true".
resource "datadog_monitor" "errors" {
  count = var.errors_enabled == "true" ? 1 : 0
  # Optional "[prefix_slug]" followed by "[environment]" and the monitor title.
  name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Lambda Number of errors {{#is_alert}}{{{comparator}}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ({{value}}){{/is_warning}}"
  type = "metric alert"
  # The monitor-specific message wins; an empty one falls back to var.message.
  message = coalesce(var.errors_message, var.message)

  # default(..., 0): a missing errors series is evaluated as zero errors.
  query = <<EOQ
${var.errors_time_aggregator}(${var.errors_timeframe}):
default(sum:aws.lambda.errors${module.filter-tags.query_alert} by {region,functionname}.as_count(),0)
> ${var.errors_threshold_critical}
EOQ

  evaluation_delay = var.evaluation_delay
  new_host_delay   = var.new_host_delay

  thresholds = {
    critical = var.errors_threshold_critical
    warning  = var.errors_threshold_warning
  }

  notify_no_data      = false
  require_full_window = false
  renotify_interval   = 0
  notify_audit        = false
  timeout_h           = 0
  include_tags        = true
  locked              = false

  tags = concat(["env:${var.environment}", "type:cloud", "provider:aws", "resource:lambda", "team:claranet", "created-by:terraform"], var.errors_extra_tags)

  lifecycle {
    # Bare attribute reference: the quoted form ("silenced") is deprecated
    # since Terraform 0.12. Changes to `silenced` are not reverted by Terraform.
    ignore_changes = [silenced]
  }
}
# Throttles
# Alerts on the number of throttled invocations, grouped by region and
# function name. Created only when var.throttles_enabled is exactly "true".
resource "datadog_monitor" "throttles" {
  count = var.throttles_enabled == "true" ? 1 : 0
  # Optional "[prefix_slug]" followed by "[environment]" and the monitor title.
  name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Lambda Invocations throttled due to concurrent limit reached {{#is_alert}}{{{comparator}}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ({{value}}){{/is_warning}}"
  type = "metric alert"
  # The monitor-specific message wins; an empty one falls back to var.message.
  message = coalesce(var.throttles_message, var.message)

  # default(..., 0): a missing throttles series is evaluated as zero throttles.
  query = <<EOQ
${var.throttles_time_aggregator}(${var.throttles_timeframe}):
default(sum:aws.lambda.throttles${module.filter-tags.query_alert} by {region,functionname}.as_count(),0)
> ${var.throttles_threshold_critical}
EOQ

  evaluation_delay = var.evaluation_delay
  new_host_delay   = var.new_host_delay

  thresholds = {
    critical = var.throttles_threshold_critical
    warning  = var.throttles_threshold_warning
  }

  notify_no_data      = false
  require_full_window = false
  renotify_interval   = 0
  notify_audit        = false
  timeout_h           = 0
  include_tags        = true
  locked              = false

  tags = concat(["env:${var.environment}", "type:cloud", "provider:aws", "resource:lambda", "team:claranet", "created-by:terraform"], var.throttles_extra_tags)

  lifecycle {
    # Bare attribute reference: the quoted form ("silenced") is deprecated
    # since Terraform 0.12. Changes to `silenced` are not reverted by Terraform.
    ignore_changes = [silenced]
  }
}
# INVOCATIONS
# Alerts when the number of invocations is at or BELOW the critical threshold
# (note the `<=` comparator), grouped by region and function name.
# Created only when var.invocations_enabled is exactly "true".
resource "datadog_monitor" "invocations" {
  count = var.invocations_enabled == "true" ? 1 : 0
  # Optional "[prefix_slug]" followed by "[environment]" and the monitor title.
  name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Lambda Number of invocations {{#is_alert}}{{{comparator}}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ({{value}}){{/is_warning}}"
  type = "metric alert"
  # The monitor-specific message wins; an empty one falls back to var.message.
  message = coalesce(var.invocations_message, var.message)

  # default(..., 0): a missing invocations series is evaluated as zero
  # invocations, which satisfies the `<=` comparison.
  query = <<EOQ
${var.invocations_time_aggregator}(${var.invocations_timeframe}):
default(sum:aws.lambda.invocations${module.filter-tags.query_alert} by {region,functionname}.as_count(),0)
<= ${var.invocations_threshold_critical}
EOQ

  evaluation_delay = var.evaluation_delay
  new_host_delay   = var.new_host_delay

  thresholds = {
    critical = var.invocations_threshold_critical
    warning  = var.invocations_threshold_warning
  }

  # NOTE(review): var.invocations_no_data_timeframe is declared in the inputs
  # but not used here; no-data notification stays disabled.
  notify_no_data      = false
  require_full_window = false
  renotify_interval   = 0
  notify_audit        = false
  timeout_h           = 0
  include_tags        = true
  locked              = false

  tags = concat(["env:${var.environment}", "type:cloud", "provider:aws", "resource:lambda", "team:claranet", "created-by:terraform"], var.invocations_extra_tags)

  lifecycle {
    # Bare attribute reference: the quoted form ("silenced") is deprecated
    # since Terraform 0.12. Changes to `silenced` are not reverted by Terraform.
    ignore_changes = [silenced]
  }
}

View File

@ -0,0 +1,20 @@
# List of ids: empty when the errors monitor is disabled (count = 0).
output "errors_id" {
  value       = datadog_monitor.errors[*].id
  description = "id for monitor errors"
}
# List of ids: empty when the invocations monitor is disabled (count = 0).
output "invocations_id" {
  value       = datadog_monitor.invocations[*].id
  description = "id for monitor invocations"
}
# List of ids: empty when the pct_errors monitor is disabled (count = 0).
output "pct_errors_id" {
  value       = datadog_monitor.pct_errors[*].id
  description = "id for monitor pct_errors"
}
# List of ids: empty when the throttles monitor is disabled (count = 0).
output "throttles_id" {
  value       = datadog_monitor.throttles[*].id
  description = "id for monitor throttles"
}