Merge branch 'MON-46-aws-lambda-tf012' into 'master'
MON-46 AWS Lambda See merge request claranet/pt-monitoring/projects/datadog/terraform/monitors!109
This commit is contained in:
commit
326474a374
@ -147,6 +147,7 @@ The `//` is very important, it's a terraform specific syntax used to separate gi
|
||||
- [elasticsearch](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/aws/elasticsearch/)
|
||||
- [elb](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/aws/elb/)
|
||||
- [kinesis-firehose](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/aws/kinesis-firehose/)
|
||||
- [lambda](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/aws/lambda/)
|
||||
- [rds](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/aws/rds/)
|
||||
- [aurora](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/aws/rds/aurora/)
|
||||
- [mysql](https://git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors/tree/master/cloud/aws/rds/aurora/mysql/)
|
||||
|
||||
77
cloud/aws/lambda/README.md
Normal file
77
cloud/aws/lambda/README.md
Normal file
@ -0,0 +1,77 @@
|
||||
# CLOUD AWS LAMBDA DataDog monitors
|
||||
|
||||
## How to use this module
|
||||
|
||||
```
|
||||
module "datadog-monitors-cloud-aws-lambda" {
|
||||
source = "git::ssh://git@git.fr.clara.net/claranet/pt-monitoring/projects/datadog/terraform/monitors.git//cloud/aws/lambda?ref={revision}"
|
||||
|
||||
environment = var.environment
|
||||
message = module.datadog-message-alerting.alerting-message
|
||||
}
|
||||
|
||||
```
|
||||
|
||||
## Purpose
|
||||
|
||||
Creates DataDog monitors with the following checks:
|
||||
|
||||
- Lambda Invocations throttled due to concurrent limit reached
|
||||
- Lambda Number of errors (disabled by default)
|
||||
- Lambda Number of invocations (disabled by default)
|
||||
- Lambda Percentage of errors
|
||||
|
||||
## Inputs
|
||||
|
||||
| Name | Description | Type | Default | Required |
|
||||
|------|-------------|:----:|:-----:|:-----:|
|
||||
| environment | Architecture environment | string | n/a | yes |
|
||||
| errors\_enabled | Flag to enable Errors monitor | string | `"false"` | no |
|
||||
| errors\_extra\_tags | Extra tags for Errors monitor | list(string) | `[]` | no |
|
||||
| errors\_message | Custom message for Errors monitor | string | `""` | no |
|
||||
| errors\_threshold\_critical | Alerting threshold in number of errors | string | `"3"` | no |
|
||||
| errors\_threshold\_warning | Warning threshold in number of errors | string | `"1"` | no |
|
||||
| errors\_time\_aggregator | Monitor aggregator for Errors [available values: min, max, sum or avg] | string | `"sum"` | no |
|
||||
| errors\_timeframe | Monitor timeframe for Errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_1h"` | no |
|
||||
| evaluation\_delay | Delay in seconds for the metric evaluation | string | `"900"` | no |
|
||||
| filter\_tags\_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `"*"` | no |
|
||||
| filter\_tags\_custom\_excluded | Tags excluded for custom filtering when filter_tags_use_defaults is false | string | `""` | no |
|
||||
| filter\_tags\_use\_defaults | Use default filter tags convention | string | `"true"` | no |
|
||||
| invocations\_enabled | Flag to enable Invocations monitor | string | `"false"` | no |
|
||||
| invocations\_extra\_tags | Extra tags for Invocations monitor | list(string) | `[]` | no |
|
||||
| invocations\_message | Custom message for Invocations monitor | string | `""` | no |
|
||||
| invocations\_no\_data\_timeframe | Timeframe to check before alerting on no data in minutes | string | `"120"` | no |
|
||||
| invocations\_threshold\_critical | Alerting threshold in number of invocations | string | `"1"` | no |
|
||||
| invocations\_threshold\_warning | Warning threshold in number of invocations | string | `"2"` | no |
|
||||
| invocations\_time\_aggregator | Monitor aggregator for Invocations [available values: min, max, sum or avg] | string | `"sum"` | no |
|
||||
| invocations\_timeframe | Monitor timeframe for Invocations [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_30m"` | no |
|
||||
| message | Message sent when a monitor is triggered | string | n/a | yes |
|
||||
| new\_host\_delay | Delay in seconds before monitor new resource | string | `"300"` | no |
|
||||
| pct\_errors\_enabled | Flag to enable Percentage of errors monitor | string | `"true"` | no |
|
||||
| pct\_errors\_extra\_tags | Extra tags for Percentage of errors monitor | list(string) | `[]` | no |
|
||||
| pct\_errors\_message | Custom message for Percentage of errors monitor | string | `""` | no |
|
||||
| pct\_errors\_threshold\_critical | Alerting threshold in percentage | string | `"30"` | no |
|
||||
| pct\_errors\_threshold\_warning | Warning threshold in percentage | string | `"20"` | no |
|
||||
| pct\_errors\_time\_aggregator | Monitor aggregator for Percentage of errors [available values: min, max, sum or avg] | string | `"sum"` | no |
|
||||
| pct\_errors\_timeframe | Monitor timeframe for Percentage of errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_1h"` | no |
|
||||
| prefix\_slug | Prefix string to prepend between brackets on every monitors names | string | `""` | no |
|
||||
| throttles\_enabled | Flag to enable Throttles monitor | string | `"true"` | no |
|
||||
| throttles\_extra\_tags | Extra tags for Throttles monitor | list(string) | `[]` | no |
|
||||
| throttles\_message | Custom message for Throttles monitor | string | `""` | no |
|
||||
| throttles\_threshold\_critical | Alerting threshold in number of throttles | string | `"3"` | no |
|
||||
| throttles\_threshold\_warning | Warning threshold in number of throttles | string | `"1"` | no |
|
||||
| throttles\_time\_aggregator | Monitor aggregator for Throttles [available values: min, max, sum or avg] | string | `"sum"` | no |
|
||||
| throttles\_timeframe | Monitor timeframe for Throttles [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_1h"` | no |
|
||||
|
||||
## Outputs
|
||||
|
||||
| Name | Description |
|
||||
|------|-------------|
|
||||
| errors\_id | id for monitor errors |
|
||||
| invocations\_id | id for monitor invocations |
|
||||
| pct\_errors\_id | id for monitor pct_errors |
|
||||
| throttles\_id | id for monitor throttles |
|
||||
|
||||
## Related documentation
|
||||
* [Datadog Documentation](https://docs.datadoghq.com/integrations/amazon_lambda/)
|
||||
* [Service documentation](https://docs.aws.amazon.com/lambda/index.html)
|
||||
211
cloud/aws/lambda/inputs.tf
Normal file
211
cloud/aws/lambda/inputs.tf
Normal file
@ -0,0 +1,211 @@
|
||||
# Datadog global variables
# Inputs shared by every monitor of this module (naming, filtering,
# default notification message and evaluation timing).

# Used in the monitor names, in the "env:" tag and by the filter-tags module.
variable "environment" {
  type        = string
  description = "Architecture environment"
}

# The three filter_tags_* inputs are forwarded as-is to the filter-tags module.
variable "filter_tags_use_defaults" {
  default     = "true"
  description = "Use default filter tags convention"
}

variable "filter_tags_custom" {
  default     = "*"
  description = "Tags used for custom filtering when filter_tags_use_defaults is false"
}

variable "filter_tags_custom_excluded" {
  default     = ""
  description = "Tags excluded for custom filtering when filter_tags_use_defaults is false"
}

# Required. Each monitor falls back to this message when its own
# <monitor>_message input is empty (see the coalesce() calls in the monitors).
variable "message" {
  description = "Message sent when a monitor is triggered"
}

# Passed to the evaluation_delay argument of every monitor.
variable "evaluation_delay" {
  default     = 900
  description = "Delay in seconds for the metric evaluation"
}

# Passed to the new_host_delay argument of every monitor.
variable "new_host_delay" {
  default     = 300
  description = "Delay in seconds before monitor new resource"
}

# When non-empty, rendered as "[<prefix_slug>]" at the start of monitor names.
variable "prefix_slug" {
  default     = ""
  description = "Prefix string to prepend between brackets on every monitors names"
}
|
||||
|
||||
# Datadog monitors variables
|
||||
|
||||
# Percentage of errors
# Inputs of the "pct_errors" monitor (errors / invocations * 100).

# The monitor is created only when this is exactly the string "true".
variable "pct_errors_enabled" {
  description = "Flag to enable Percentage of errors monitor"
  type        = string
  default     = "true"
}

# Appended to the fixed tag list of the monitor.
variable "pct_errors_extra_tags" {
  description = "Extra tags for Percentage of errors monitor"
  type        = list(string)
  default     = []
}

# Overrides var.message for this monitor when non-empty.
variable "pct_errors_message" {
  description = "Custom message for Percentage of errors monitor"
  type        = string
  default     = ""
}

# Interpolated as the time aggregation function of the query. The default
# ("sum") was missing from the documented list of values.
variable "pct_errors_time_aggregator" {
  description = "Monitor aggregator for Percentage of errors [available values: min, max, sum or avg]"
  type        = string
  default     = "sum"
}

variable "pct_errors_timeframe" {
  description = "Monitor timeframe for Percentage of errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
  type        = string
  default     = "last_1h"
}

# Compared with ">" in the query, so critical must stay above warning.
variable "pct_errors_threshold_critical" {
  description = "Alerting threshold in percentage"
  default     = 30
}

variable "pct_errors_threshold_warning" {
  description = "Warning threshold in percentage"
  default     = 20
}
|
||||
|
||||
# Errors count
# Inputs of the "errors" monitor (absolute number of Lambda errors).

# The monitor is created only when this is exactly the string "true";
# it is disabled by default.
variable "errors_enabled" {
  description = "Flag to enable Errors monitor"
  type        = string
  default     = "false"
}

# Appended to the fixed tag list of the monitor.
variable "errors_extra_tags" {
  description = "Extra tags for Errors monitor"
  type        = list(string)
  default     = []
}

# Overrides var.message for this monitor when non-empty.
variable "errors_message" {
  description = "Custom message for Errors monitor"
  type        = string
  default     = ""
}

# Interpolated as the time aggregation function of the query. The default
# ("sum") was missing from the documented list of values.
variable "errors_time_aggregator" {
  description = "Monitor aggregator for Errors [available values: min, max, sum or avg]"
  type        = string
  default     = "sum"
}

variable "errors_timeframe" {
  description = "Monitor timeframe for Errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
  type        = string
  default     = "last_1h"
}

# The monitor compares a count of aws.lambda.errors, not a duration, so the
# unit is a number of errors (the previous text said "milliseconds").
variable "errors_threshold_critical" {
  description = "Alerting threshold in number of errors"
  default     = 3
}

variable "errors_threshold_warning" {
  description = "Warning threshold in number of errors"
  default     = 1
}
|
||||
|
||||
# Throttles count
# Inputs of the "throttles" monitor (number of throttled invocations).

# The monitor is created only when this is exactly the string "true".
variable "throttles_enabled" {
  description = "Flag to enable Throttles monitor"
  type        = string
  default     = "true"
}

# Appended to the fixed tag list of the monitor.
variable "throttles_extra_tags" {
  description = "Extra tags for Throttles monitor"
  type        = list(string)
  default     = []
}

# Overrides var.message for this monitor when non-empty.
variable "throttles_message" {
  description = "Custom message for Throttles monitor"
  type        = string
  default     = ""
}

# Interpolated as the time aggregation function of the query. The default
# ("sum") was missing from the documented list of values.
variable "throttles_time_aggregator" {
  description = "Monitor aggregator for Throttles [available values: min, max, sum or avg]"
  type        = string
  default     = "sum"
}

variable "throttles_timeframe" {
  description = "Monitor timeframe for Throttles [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
  type        = string
  default     = "last_1h"
}

# Compared with ">" in the query, so critical must stay above warning.
variable "throttles_threshold_critical" {
  description = "Alerting threshold in number of throttles"
  default     = 3
}

variable "throttles_threshold_warning" {
  description = "Warning threshold in number of throttles"
  default     = 1
}
|
||||
|
||||
# Invocations
# Inputs of the "invocations" monitor (alerts on a LOW number of invocations).

# The monitor is created only when this is exactly the string "true";
# it is disabled by default.
variable "invocations_enabled" {
  description = "Flag to enable Invocations monitor"
  type        = string
  default     = "false"
}

# Appended to the fixed tag list of the monitor.
variable "invocations_extra_tags" {
  description = "Extra tags for Invocations monitor"
  type        = list(string)
  default     = []
}

# Overrides var.message for this monitor when non-empty.
variable "invocations_message" {
  description = "Custom message for Invocations monitor"
  type        = string
  default     = ""
}

# Interpolated as the time aggregation function of the query. The default
# ("sum") was missing from the documented list of values.
variable "invocations_time_aggregator" {
  description = "Monitor aggregator for Invocations [available values: min, max, sum or avg]"
  type        = string
  default     = "sum"
}

variable "invocations_timeframe" {
  description = "Monitor timeframe for Invocations [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
  type        = string
  default     = "last_30m"
}

# The query uses "<=", so for this monitor the warning threshold is expected
# to be HIGHER than the critical one.
variable "invocations_threshold_critical" {
  description = "Alerting threshold in number of invocations"
  default     = 1
}

variable "invocations_threshold_warning" {
  description = "Warning threshold in number of invocations"
  default     = 2
}

# NOTE(review): no monitor in monitors-lambda.tf appears to reference this
# variable (the invocations monitor sets notify_no_data = false) - confirm
# whether it should be wired to no_data_timeframe or removed.
variable "invocations_no_data_timeframe" {
  description = "Timeframe to check before alerting on no data in minutes"
  default     = 120
}
|
||||
9
cloud/aws/lambda/modules.tf
Normal file
9
cloud/aws/lambda/modules.tf
Normal file
@ -0,0 +1,9 @@
|
||||
# Shared tag-filter helper. The monitors of this module interpolate its
# "query_alert" output into their metric queries.
module "filter-tags" {
  source = "../../../common/filter-tags"

  # Scope of the filter.
  resource    = "aws_lambda"
  environment = var.environment

  # Caller-provided filtering options, forwarded unchanged.
  filter_tags_use_defaults    = var.filter_tags_use_defaults
  filter_tags_custom          = var.filter_tags_custom
  filter_tags_custom_excluded = var.filter_tags_custom_excluded
}
|
||||
149
cloud/aws/lambda/monitors-lambda.tf
Normal file
149
cloud/aws/lambda/monitors-lambda.tf
Normal file
@ -0,0 +1,149 @@
|
||||
# Errors Percent
# Metric alert on Lambda errors expressed as a percentage of invocations,
# grouped by {region,functionname}.
resource "datadog_monitor" "pct_errors" {
  # Created only when the flag is exactly the string "true".
  count   = var.pct_errors_enabled == "true" ? 1 : 0
  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Lambda Percentage of errors {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
  type    = "metric alert"
  # Monitor-specific message when set, module-wide message otherwise.
  message = coalesce(var.pct_errors_message, var.message)

  # errors / invocations * 100. The default() wrappers supply 0 for the
  # numerator, 1 for the denominator and 0 for the overall result -
  # presumably so that missing data evaluates to 0% rather than no value.
  query = <<EOQ
    ${var.pct_errors_time_aggregator}(${var.pct_errors_timeframe}):
      default(
        (default(sum:aws.lambda.errors${module.filter-tags.query_alert} by {region,functionname}.as_count(),0)
        /
        default(sum:aws.lambda.invocations${module.filter-tags.query_alert} by {region,functionname}.as_count(),1))
      * 100,0)
    > ${var.pct_errors_threshold_critical}
EOQ

  evaluation_delay = var.evaluation_delay
  new_host_delay   = var.new_host_delay

  thresholds = {
    critical = var.pct_errors_threshold_critical
    warning  = var.pct_errors_threshold_warning
  }

  notify_no_data      = false
  require_full_window = false
  renotify_interval   = 0
  notify_audit        = false
  timeout_h           = 0
  include_tags        = true
  locked              = false

  tags = concat(["env:${var.environment}", "type:cloud", "provider:aws", "resource:lambda", "team:claranet", "created-by:terraform"], var.pct_errors_extra_tags)

  lifecycle {
    # Terraform 0.12 expects a bare attribute reference here; the quoted
    # form ["silenced"] is deprecated. Changes to "silenced" are ignored.
    ignore_changes = [silenced]
  }
}
|
||||
|
||||
# Errors Absolute Value
# Metric alert on the number of Lambda errors, grouped by
# {region,functionname}.
resource "datadog_monitor" "errors" {
  # Created only when the flag is exactly the string "true".
  count   = var.errors_enabled == "true" ? 1 : 0
  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Lambda Number of errors {{#is_alert}}{{{comparator}}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ({{value}}){{/is_warning}}"
  type    = "metric alert"
  # Monitor-specific message when set, module-wide message otherwise.
  message = coalesce(var.errors_message, var.message)

  # default(..., 0) supplies 0 when the metric has no value.
  query = <<EOQ
    ${var.errors_time_aggregator}(${var.errors_timeframe}):
      default(sum:aws.lambda.errors${module.filter-tags.query_alert} by {region,functionname}.as_count(),0)
    > ${var.errors_threshold_critical}
EOQ

  evaluation_delay = var.evaluation_delay
  new_host_delay   = var.new_host_delay

  thresholds = {
    critical = var.errors_threshold_critical
    warning  = var.errors_threshold_warning
  }

  notify_no_data      = false
  require_full_window = false
  renotify_interval   = 0
  notify_audit        = false
  timeout_h           = 0
  include_tags        = true
  locked              = false

  tags = concat(["env:${var.environment}", "type:cloud", "provider:aws", "resource:lambda", "team:claranet", "created-by:terraform"], var.errors_extra_tags)

  lifecycle {
    # Terraform 0.12 expects a bare attribute reference here; the quoted
    # form ["silenced"] is deprecated. Changes to "silenced" are ignored.
    ignore_changes = [silenced]
  }
}
|
||||
|
||||
# Throttles
# Metric alert on the number of throttled Lambda invocations, grouped by
# {region,functionname}.
resource "datadog_monitor" "throttles" {
  # Created only when the flag is exactly the string "true".
  count   = var.throttles_enabled == "true" ? 1 : 0
  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Lambda Invocations throttled due to concurrent limit reached {{#is_alert}}{{{comparator}}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ({{value}}){{/is_warning}}"
  type    = "metric alert"
  # Monitor-specific message when set, module-wide message otherwise.
  message = coalesce(var.throttles_message, var.message)

  # default(..., 0) supplies 0 when the metric has no value.
  query = <<EOQ
    ${var.throttles_time_aggregator}(${var.throttles_timeframe}):
      default(sum:aws.lambda.throttles${module.filter-tags.query_alert} by {region,functionname}.as_count(),0)
    > ${var.throttles_threshold_critical}
EOQ

  evaluation_delay = var.evaluation_delay
  new_host_delay   = var.new_host_delay

  thresholds = {
    critical = var.throttles_threshold_critical
    warning  = var.throttles_threshold_warning
  }

  notify_no_data      = false
  require_full_window = false
  renotify_interval   = 0
  notify_audit        = false
  timeout_h           = 0
  include_tags        = true
  locked              = false

  tags = concat(["env:${var.environment}", "type:cloud", "provider:aws", "resource:lambda", "team:claranet", "created-by:terraform"], var.throttles_extra_tags)

  lifecycle {
    # Terraform 0.12 expects a bare attribute reference here; the quoted
    # form ["silenced"] is deprecated. Changes to "silenced" are ignored.
    ignore_changes = [silenced]
  }
}
|
||||
|
||||
# INVOCATIONS
# Metric alert on a LOW number of Lambda invocations, grouped by
# {region,functionname}. Unlike the other monitors of this file the
# comparison is "<=", so the warning threshold sits above the critical one.
resource "datadog_monitor" "invocations" {
  # Created only when the flag is exactly the string "true".
  count   = var.invocations_enabled == "true" ? 1 : 0
  name    = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Lambda Number of invocations {{#is_alert}}{{{comparator}}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ({{value}}){{/is_warning}}"
  type    = "metric alert"
  # Monitor-specific message when set, module-wide message otherwise.
  message = coalesce(var.invocations_message, var.message)

  # default(..., 0) supplies 0 when the metric has no value.
  query = <<EOQ
    ${var.invocations_time_aggregator}(${var.invocations_timeframe}):
      default(sum:aws.lambda.invocations${module.filter-tags.query_alert} by {region,functionname}.as_count(),0)
    <= ${var.invocations_threshold_critical}
EOQ

  evaluation_delay = var.evaluation_delay
  new_host_delay   = var.new_host_delay

  thresholds = {
    critical = var.invocations_threshold_critical
    warning  = var.invocations_threshold_warning
  }

  # NOTE(review): var.invocations_no_data_timeframe is declared in inputs.tf
  # but not used here - confirm whether no-data alerting was intended.
  notify_no_data      = false
  require_full_window = false
  renotify_interval   = 0
  notify_audit        = false
  timeout_h           = 0
  include_tags        = true
  locked              = false

  tags = concat(["env:${var.environment}", "type:cloud", "provider:aws", "resource:lambda", "team:claranet", "created-by:terraform"], var.invocations_extra_tags)

  lifecycle {
    # Terraform 0.12 expects a bare attribute reference here; the quoted
    # form ["silenced"] is deprecated. Changes to "silenced" are ignored.
    ignore_changes = [silenced]
  }
}
|
||||
20
cloud/aws/lambda/outputs.tf
Normal file
20
cloud/aws/lambda/outputs.tf
Normal file
@ -0,0 +1,20 @@
|
||||
# Monitor ids exported by the module.
# Every monitor is declared with "count", so each output is a list that is
# empty when the corresponding monitor is disabled.

output "errors_id" {
  value       = datadog_monitor.errors[*].id
  description = "id for monitor errors"
}

output "invocations_id" {
  value       = datadog_monitor.invocations[*].id
  description = "id for monitor invocations"
}

output "pct_errors_id" {
  value       = datadog_monitor.pct_errors[*].id
  description = "id for monitor pct_errors"
}

output "throttles_id" {
  value       = datadog_monitor.throttles[*].id
  description = "id for monitor throttles"
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user