MON-390 Monitors for Azure Functions

This commit is contained in:
Laurent Piroelle 2019-02-26 11:57:21 +01:00
parent 75c59e4bda
commit 7fa04791e7
7 changed files with 483 additions and 0 deletions

View File

@ -103,6 +103,7 @@ The `//` is very important, it's a terraform specific syntax used to separate gi
- [datalakestore](https://git.fr.clara.net/claranet/cloudnative/projects/datadog/terraform/monitors/tree/master/cloud/azure/datalakestore/)
- [eventgrid](https://git.fr.clara.net/claranet/cloudnative/projects/datadog/terraform/monitors/tree/master/cloud/azure/eventgrid/)
- [eventhub](https://git.fr.clara.net/claranet/cloudnative/projects/datadog/terraform/monitors/tree/master/cloud/azure/eventhub/)
- [functions](https://git.fr.clara.net/claranet/cloudnative/projects/datadog/terraform/monitors/tree/master/cloud/azure/functions/)
- [iothubs](https://git.fr.clara.net/claranet/cloudnative/projects/datadog/terraform/monitors/tree/master/cloud/azure/iothubs/)
- [keyvault](https://git.fr.clara.net/claranet/cloudnative/projects/datadog/terraform/monitors/tree/master/cloud/azure/keyvault/)
- [load-balancer](https://git.fr.clara.net/claranet/cloudnative/projects/datadog/terraform/monitors/tree/master/cloud/azure/load-balancer/)

View File

@ -0,0 +1,85 @@
# CLOUD AZURE FUNCTIONS DataDog monitors
## How to use this module
```hcl
module "datadog-monitors-cloud-azure-functions" {
  source = "git::ssh://git@git.fr.clara.net/claranet/cloudnative/projects/datadog/terraform/monitors.git//cloud/azure/functions?ref={revision}"

  environment               = "${var.environment}"
  message                   = "${module.datadog-message-alerting.alerting-message}"
  functions_max_scale_count = 2
}
```
## Purpose
Creates DataDog monitors with the following checks:
- Function App connections count too high
- Function App HTTP 5xx errors too high
- Function App memory usage too high
- Function App threads count too high
## Inputs
| Name | Description | Type | Default | Required |
|------|-------------|:----:|:-----:|:-----:|
| environment | Architecture environment | string | n/a | yes |
| evaluation\_delay | Delay in seconds for the metric evaluation | string | `"900"` | no |
| filter\_tags\_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `"*"` | no |
| filter\_tags\_custom\_excluded | Tags excluded for custom filtering when filter_tags_use_defaults is false | string | `""` | no |
| filter\_tags\_use\_defaults | Use default filter tags convention | string | `"true"` | no |
| functions\_max\_scale\_count | Maximum number of Function instances | string | n/a | yes |
| high\_connections\_count\_enabled | Flag to enable Functions high connections count monitor | string | `"true"` | no |
| high\_connections\_count\_extra\_tags | Extra tags for Functions high connections count monitor | list | `[]` | no |
| high\_connections\_count\_message | Custom message for Functions high connections count monitor | string | `""` | no |
| high\_connections\_count\_silenced | Groups to mute for Functions high connections count monitor | map | `{}` | no |
| high\_connections\_count\_threshold\_critical | Alerting threshold for Functions high connections count | string | `"290"` | no |
| high\_connections\_count\_threshold\_warning | Warning threshold for Functions high connections count | string | `"250"` | no |
| high\_connections\_count\_time\_aggregator | Monitor aggregator for Functions high connections count [available values: min, max or avg] | string | `"min"` | no |
| high\_connections\_count\_timeframe | Monitor timeframe for Functions high connections count [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no |
| high\_threads\_count\_enabled | Flag to enable Functions high threads count monitor | string | `"true"` | no |
| high\_threads\_count\_extra\_tags | Extra tags for Functions high threads count monitor | list | `[]` | no |
| high\_threads\_count\_message | Custom message for Functions high threads count monitor | string | `""` | no |
| high\_threads\_count\_silenced | Groups to mute for Functions high threads count monitor | map | `{}` | no |
| high\_threads\_count\_threshold\_critical | Alerting threshold for Functions high threads count | string | `"510"` | no |
| high\_threads\_count\_threshold\_warning | Warning threshold for Functions high threads count | string | `"490"` | no |
| high\_threads\_count\_time\_aggregator | Monitor aggregator for Functions high threads count [available values: min, max or avg] | string | `"min"` | no |
| high\_threads\_count\_timeframe | Monitor timeframe for Functions high threads count [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no |
| http\_5xx\_errors\_rate\_enabled | Flag to enable Functions Http 5xx errors rate monitor | string | `"true"` | no |
| http\_5xx\_errors\_rate\_extra\_tags | Extra tags for Functions Http 5xx errors rate monitor | list | `[]` | no |
| http\_5xx\_errors\_rate\_message | Custom message for Functions Http 5xx errors rate monitor | string | `""` | no |
| http\_5xx\_errors\_rate\_silenced | Groups to mute for Functions Http 5xx errors rate monitor | map | `{}` | no |
| http\_5xx\_errors\_rate\_threshold\_critical | Alerting threshold for Functions Http 5xx errors rate | string | `"20"` | no |
| http\_5xx\_errors\_rate\_threshold\_warning | Warning threshold for Functions Http 5xx errors rate | string | `"10"` | no |
| http\_5xx\_errors\_rate\_time\_aggregator | Monitor aggregator for Functions Http 5xx errors rate [available values: min, max or avg] | string | `"min"` | no |
| http\_5xx\_errors\_rate\_timeframe | Monitor timeframe for Functions Http 5xx errors rate [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no |
| memory\_usage\_enabled | Flag to enable Functions memory usage monitor | string | `"true"` | no |
| memory\_usage\_extra\_tags | Extra tags for Functions memory usage monitor | list | `[]` | no |
| memory\_usage\_message | Custom message for Functions memory usage monitor | string | `""` | no |
| memory\_usage\_silenced | Groups to mute for Functions memory usage monitor | map | `{}` | no |
| memory\_usage\_threshold\_critical | Alerting threshold for Functions memory usage in bytes | string | `"125829120"` | no |
| memory\_usage\_threshold\_warning | Warning threshold for Functions memory usage in bytes | string | `"104857600"` | no |
| memory\_usage\_time\_aggregator | Monitor aggregator for Functions memory usage [available values: min, max or avg] | string | `"min"` | no |
| memory\_usage\_timeframe | Monitor timeframe for Functions memory usage [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no |
| message | Message sent when a monitor is triggered | string | n/a | yes |
| new\_host\_delay | Delay in seconds before monitor new resource | string | `"300"` | no |
## Outputs
| Name | Description |
|------|-------------|
| function\_high\_connections\_count\_id | id for monitor function_high_connections_count |
| function\_high\_threads\_count\_id | id for monitor function_high_threads_count |
| function\_http\_5xx\_errors\_rate\_id | id for monitor function_http_5xx_errors_rate |
| function\_memory\_usage\_id | id for monitor function_memory_usage |
## Related documentation
Datadog Azure documentation: [https://docs.datadoghq.com/integrations/azure/](https://docs.datadoghq.com/integrations/azure/)
Azure Monitor metrics: [https://docs.microsoft.com/en-us/azure/azure-monitor/platform/metrics-supported#microsoftwebsites-functions](https://docs.microsoft.com/en-us/azure/azure-monitor/platform/metrics-supported#microsoftwebsites-functions)
Azure Functions connections limits: [https://docs.microsoft.com/en-us/azure/azure-functions/manage-connections#connections-limit](https://docs.microsoft.com/en-us/azure/azure-functions/manage-connections#connections-limit)

View File

@ -0,0 +1,222 @@
# Global inputs shared by every monitor of this module.
# All of them are declared as strings, which matches the types listed in the
# module README.

# Environment name: interpolated into monitor names and the `env:` tag, and
# forwarded to the filter-tags module.
variable "environment" {
  description = "Architecture environment"
  type        = "string"
}

# The three filter_tags_* inputs are passed unchanged to the filter-tags module,
# which produces the tag filter used inside the monitor queries.
variable "filter_tags_use_defaults" {
  description = "Use default filter tags convention"
  type        = "string"
  default     = "true"
}

variable "filter_tags_custom" {
  description = "Tags used for custom filtering when filter_tags_use_defaults is false"
  type        = "string"
  default     = "*"
}

variable "filter_tags_custom_excluded" {
  description = "Tags excluded for custom filtering when filter_tags_use_defaults is false"
  type        = "string"
  default     = ""
}

# Default notification message; each monitor falls back to it through
# coalesce() when its own *_message input is empty.
variable "message" {
  description = "Message sent when a monitor is triggered"
  type        = "string"
}

variable "evaluation_delay" {
  description = "Delay in seconds for the metric evaluation"
  type        = "string"
  default     = 900
}

variable "new_host_delay" {
  description = "Delay in seconds before monitor new resource"
  type        = "string"
  default     = 300
}

# Required, no default. Used as the divisor in the connections and threads
# monitor queries, so their thresholds are compared against count / this value.
variable "functions_max_scale_count" {
  description = "Maximum number of Function instances"
  type        = "string"
}
# Azure Function App specific variables

# --- HTTP 5xx errors rate monitor -------------------------------------------

# Forwarded to the monitor's `silenced` attribute.
variable "http_5xx_errors_rate_silenced" {
  description = "Groups to mute for Functions Http 5xx errors rate monitor"
  type        = "map"
  default     = {}
}

# Drives the monitor's `count`: the monitor is created only when this is true.
variable "http_5xx_errors_rate_enabled" {
  description = "Flag to enable Functions Http 5xx errors rate monitor"
  type        = "string"
  default     = "true"
}

# Appended to the fixed tag list of the monitor.
variable "http_5xx_errors_rate_extra_tags" {
  description = "Extra tags for Functions Http 5xx errors rate monitor"
  type        = "list"
  default     = []
}

# When left empty, the monitor falls back to var.message.
variable "http_5xx_errors_rate_message" {
  description = "Custom message for Functions Http 5xx errors rate monitor"
  type        = "string"
  default     = ""
}

variable "http_5xx_errors_rate_time_aggregator" {
  description = "Monitor aggregator for Functions Http 5xx errors rate [available values: min, max or avg]"
  type        = "string"
  default     = "min"
}

variable "http_5xx_errors_rate_timeframe" {
  description = "Monitor timeframe for Functions Http 5xx errors rate [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
  type        = "string"
  default     = "last_5m"
}

# Thresholds are percentages: the monitor query multiplies the error ratio by 100.
variable "http_5xx_errors_rate_threshold_critical" {
  description = "Alerting threshold for Functions Http 5xx errors rate"
  type        = "string"
  default     = 20
}

variable "http_5xx_errors_rate_threshold_warning" {
  description = "Warning threshold for Functions Http 5xx errors rate"
  type        = "string"
  default     = 10
}
# --- High connections count monitor -----------------------------------------

# Forwarded to the monitor's `silenced` attribute.
variable "high_connections_count_silenced" {
  description = "Groups to mute for Functions high connections count monitor"
  type        = "map"
  default     = {}
}

# Drives the monitor's `count`: the monitor is created only when this is true.
variable "high_connections_count_enabled" {
  description = "Flag to enable Functions high connections count monitor"
  type        = "string"
  default     = "true"
}

# Appended to the fixed tag list of the monitor.
variable "high_connections_count_extra_tags" {
  description = "Extra tags for Functions high connections count monitor"
  type        = "list"
  default     = []
}

# When left empty, the monitor falls back to var.message.
variable "high_connections_count_message" {
  description = "Custom message for Functions high connections count monitor"
  type        = "string"
  default     = ""
}

variable "high_connections_count_time_aggregator" {
  description = "Monitor aggregator for Functions high connections count [available values: min, max or avg]"
  type        = "string"
  default     = "min"
}

variable "high_connections_count_timeframe" {
  description = "Monitor timeframe for Functions high connections count [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
  type        = "string"
  default     = "last_5m"
}

# Thresholds are compared against the connections metric divided by
# var.functions_max_scale_count (see the monitor query).
variable "high_connections_count_threshold_critical" {
  description = "Alerting threshold for Functions high connections count"
  type        = "string"
  default     = 290
}

variable "high_connections_count_threshold_warning" {
  description = "Warning threshold for Functions high connections count"
  type        = "string"
  default     = 250
}
# --- High threads count monitor ---------------------------------------------

# Forwarded to the monitor's `silenced` attribute.
variable "high_threads_count_silenced" {
  description = "Groups to mute for Functions high threads count monitor"
  type        = "map"
  default     = {}
}

# Drives the monitor's `count`: the monitor is created only when this is true.
variable "high_threads_count_enabled" {
  description = "Flag to enable Functions high threads count monitor"
  type        = "string"
  default     = "true"
}

# Appended to the fixed tag list of the monitor.
variable "high_threads_count_extra_tags" {
  description = "Extra tags for Functions high threads count monitor"
  type        = "list"
  default     = []
}

# When left empty, the monitor falls back to var.message.
variable "high_threads_count_message" {
  description = "Custom message for Functions high threads count monitor"
  type        = "string"
  default     = ""
}

variable "high_threads_count_time_aggregator" {
  description = "Monitor aggregator for Functions high threads count [available values: min, max or avg]"
  type        = "string"
  default     = "min"
}

variable "high_threads_count_timeframe" {
  description = "Monitor timeframe for Functions high threads count [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
  type        = "string"
  default     = "last_5m"
}

# Thresholds are compared against the thread count metric divided by
# var.functions_max_scale_count (see the monitor query).
variable "high_threads_count_threshold_critical" {
  description = "Alerting threshold for Functions high threads count"
  type        = "string"
  default     = 510
}

variable "high_threads_count_threshold_warning" {
  description = "Warning threshold for Functions high threads count"
  type        = "string"
  default     = 490
}
# --- Memory usage monitor ---------------------------------------------------

# Forwarded to the monitor's `silenced` attribute.
variable "memory_usage_silenced" {
  description = "Groups to mute for Functions memory usage monitor"
  type        = "map"
  default     = {}
}

# Drives the monitor's `count`: the monitor is created only when this is true.
variable "memory_usage_enabled" {
  description = "Flag to enable Functions memory usage monitor"
  type        = "string"
  default     = "true"
}

# Appended to the fixed tag list of the monitor.
variable "memory_usage_extra_tags" {
  description = "Extra tags for Functions memory usage monitor"
  type        = "list"
  default     = []
}

# When left empty, the monitor falls back to var.message.
variable "memory_usage_message" {
  description = "Custom message for Functions memory usage monitor"
  type        = "string"
  default     = ""
}

variable "memory_usage_time_aggregator" {
  description = "Monitor aggregator for Functions memory usage [available values: min, max or avg]"
  type        = "string"
  default     = "min"
}

variable "memory_usage_timeframe" {
  description = "Monitor timeframe for Functions memory usage [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]"
  type        = "string"
  default     = "last_5m"
}

# Thresholds are expressed in bytes.
variable "memory_usage_threshold_critical" {
  description = "Alerting threshold for Functions memory usage in bytes"
  type        = "string"
  default     = 125829120 # 120 MiB (120 * 1024 * 1024 bytes)
}

variable "memory_usage_threshold_warning" {
  description = "Warning threshold for Functions memory usage in bytes"
  type        = "string"
  default     = 104857600 # 100 MiB (100 * 1024 * 1024 bytes)
}

View File

@ -0,0 +1,23 @@
azure.functions.average_memory_working_set
azure.functions.bytes_received
azure.functions.bytes_sent
azure.functions.connections
azure.functions.current_assemblies
azure.functions.function_execution_count
azure.functions.function_execution_units
azure.functions.gen_0_garbage_collections
azure.functions.gen_1_garbage_collections
azure.functions.gen_2_garbage_collections
azure.functions.handle_count
azure.functions.http5xx
azure.functions.io_other_bytes_per_second
azure.functions.io_other_operations_per_second
azure.functions.io_read_bytes_per_second
azure.functions.io_read_operations_per_second
azure.functions.io_write_bytes_per_second
azure.functions.io_write_operations_per_second
azure.functions.memory_working_set
azure.functions.private_bytes
azure.functions.thread_count
azure.functions.total_app_domains
azure.functions.total_app_domains_unloaded

View File

@ -0,0 +1,9 @@
# Shared helper module that turns the filter_tags_* inputs into the tag filter
# consumed by the monitor queries through `module.filter-tags.query_alert`.
module "filter-tags" {
  source = "../../../common/filter-tags"

  # Identity of the monitored resource family.
  resource    = "azure_functions"
  environment = "${var.environment}"

  # Filtering options, forwarded as-is from the module inputs.
  filter_tags_use_defaults    = "${var.filter_tags_use_defaults}"
  filter_tags_custom          = "${var.filter_tags_custom}"
  filter_tags_custom_excluded = "${var.filter_tags_custom_excluded}"
}

View File

@ -0,0 +1,124 @@
# HTTP 5xx errors rate monitor.
# The query divides the http5xx rate by the function execution rate, grouped by
# {resource_group,region,name}, and multiplies by 100 to get a percentage.
# default() substitutes 0 for a missing numerator, 1 for a missing denominator
# and 0 for a missing overall result.
resource "datadog_monitor" "function_http_5xx_errors_rate" {
  # Feature flag: zero instances when the monitor is disabled.
  count = "${var.http_5xx_errors_rate_enabled ? 1 : 0}"

  type    = "metric alert"
  name    = "[${var.environment}] Function App HTTP 5xx errors too high {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}"
  message = "${coalesce(var.http_5xx_errors_rate_message, var.message)}"

  # The heredoc body is kept flush-left on purpose: its whitespace is part of
  # the query string.
  query = <<EOQ
${var.http_5xx_errors_rate_time_aggregator}(${var.http_5xx_errors_rate_timeframe}): default(
default(avg:azure.functions.http5xx${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) /
default(avg:azure.functions.function_execution_count${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 1)
, 0) * 100 > ${var.http_5xx_errors_rate_threshold_critical}
EOQ

  thresholds {
    critical = "${var.http_5xx_errors_rate_threshold_critical}"
    warning  = "${var.http_5xx_errors_rate_threshold_warning}"
  }

  evaluation_delay = "${var.evaluation_delay}"
  new_host_delay   = "${var.new_host_delay}"
  silenced         = "${var.http_5xx_errors_rate_silenced}"

  notify_no_data      = false
  require_full_window = false
  renotify_interval   = 0
  timeout_h           = 1
  include_tags        = true

  # Fixed tags first; the caller-supplied extra tags list is appended last.
  tags = ["env:${var.environment}", "type:cloud", "provider:azure", "resource:azure_functions", "team:claranet", "created-by:terraform", "${var.http_5xx_errors_rate_extra_tags}"]
}
# High connections count monitor.
# The query takes the connections metric grouped by {resource_group,region,name},
# defaults it to 0 when missing, and divides it by var.functions_max_scale_count.
# The evaluated value is therefore a count, not a percentage, so the monitor
# name shows threshold and value without a "%" suffix.
#
# NOTE(review): unlike the 5xx and memory monitors, this query has no explicit
# space aggregator prefix (e.g. `avg:`) and applies `.as_rate()` to the
# connections metric. Left unchanged; confirm this is intended.
resource "datadog_monitor" "function_high_connections_count" {
  # Feature flag: zero instances when the monitor is disabled.
  count = "${var.high_connections_count_enabled ? 1 : 0}"

  name    = "[${var.environment}] Function App connections count too high {{#is_alert}}{{{comparator}}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ({{value}}){{/is_warning}}"
  type    = "metric alert"
  message = "${coalesce(var.high_connections_count_message, var.message)}"

  # The heredoc body is kept flush-left on purpose: its whitespace is part of
  # the query string.
  query = <<EOF
${var.high_connections_count_time_aggregator}(${var.high_connections_count_timeframe}):
default(azure.functions.connections${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) / ${var.functions_max_scale_count}
> ${var.high_connections_count_threshold_critical}
EOF

  evaluation_delay = "${var.evaluation_delay}"
  new_host_delay   = "${var.new_host_delay}"

  thresholds {
    warning  = "${var.high_connections_count_threshold_warning}"
    critical = "${var.high_connections_count_threshold_critical}"
  }

  silenced = "${var.high_connections_count_silenced}"

  notify_no_data      = false
  renotify_interval   = 0
  require_full_window = false
  timeout_h           = 1
  include_tags        = true

  # Fixed tags first; the caller-supplied extra tags list is appended last.
  tags = ["env:${var.environment}", "type:cloud", "provider:azure", "resource:azure_functions", "team:claranet", "created-by:terraform", "${var.high_connections_count_extra_tags}"]
}
# High threads count monitor.
# The query takes the thread_count metric grouped by {resource_group,region,name},
# defaults it to 0 when missing, and divides it by var.functions_max_scale_count.
# The evaluated value is therefore a count, not a percentage, so the monitor
# name shows threshold and value without a "%" suffix.
#
# NOTE(review): unlike the 5xx and memory monitors, this query has no explicit
# space aggregator prefix (e.g. `avg:`) and applies `.as_rate()` to the
# thread count metric. Left unchanged; confirm this is intended.
resource "datadog_monitor" "function_high_threads_count" {
  # Feature flag: zero instances when the monitor is disabled.
  count = "${var.high_threads_count_enabled ? 1 : 0}"

  name    = "[${var.environment}] Function App threads count too high {{#is_alert}}{{{comparator}}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ({{value}}){{/is_warning}}"
  type    = "metric alert"
  message = "${coalesce(var.high_threads_count_message, var.message)}"

  # The heredoc body is kept flush-left on purpose: its whitespace is part of
  # the query string.
  query = <<EOF
${var.high_threads_count_time_aggregator}(${var.high_threads_count_timeframe}):
default(azure.functions.thread_count${module.filter-tags.query_alert} by {resource_group,region,name}.as_rate(), 0) / ${var.functions_max_scale_count}
> ${var.high_threads_count_threshold_critical}
EOF

  evaluation_delay = "${var.evaluation_delay}"
  new_host_delay   = "${var.new_host_delay}"

  thresholds {
    warning  = "${var.high_threads_count_threshold_warning}"
    critical = "${var.high_threads_count_threshold_critical}"
  }

  silenced = "${var.high_threads_count_silenced}"

  notify_no_data      = false
  renotify_interval   = 0
  require_full_window = false
  timeout_h           = 1
  include_tags        = true

  # Fixed tags first; the caller-supplied extra tags list is appended last.
  tags = ["env:${var.environment}", "type:cloud", "provider:azure", "resource:azure_functions", "team:claranet", "created-by:terraform", "${var.high_threads_count_extra_tags}"]
}
# Memory usage monitor.
# Compares the average_memory_working_set metric, averaged per
# {resource_group,region,name}, against thresholds expressed in bytes.
resource "datadog_monitor" "function_memory_usage" {
  # Feature flag: zero instances when the monitor is disabled.
  count = "${var.memory_usage_enabled ? 1 : 0}"

  type    = "metric alert"
  name    = "[${var.environment}] Function App memory usage too high {{#is_alert}}{{{comparator}}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ({{value}}){{/is_warning}}"
  message = "${coalesce(var.memory_usage_message, var.message)}"

  # The heredoc body is kept flush-left on purpose: its whitespace is part of
  # the query string.
  query = <<EOQ
${var.memory_usage_time_aggregator}(${var.memory_usage_timeframe}): (
avg:azure.functions.average_memory_working_set${module.filter-tags.query_alert} by {resource_group,region,name}
) > ${var.memory_usage_threshold_critical}
EOQ

  thresholds {
    critical = "${var.memory_usage_threshold_critical}"
    warning  = "${var.memory_usage_threshold_warning}"
  }

  evaluation_delay = "${var.evaluation_delay}"
  new_host_delay   = "${var.new_host_delay}"
  silenced         = "${var.memory_usage_silenced}"

  notify_no_data      = false
  require_full_window = false
  renotify_interval   = 0

  # NOTE(review): the other three monitors in this file use timeout_h = 1;
  # confirm that 0 is intentional here.
  timeout_h    = 0
  include_tags = true

  # Fixed tags first; the caller-supplied extra tags list is appended last.
  tags = ["env:${var.environment}", "type:cloud", "provider:azure", "resource:azure_functions", "team:claranet", "created-by:terraform", "${var.memory_usage_extra_tags}"]
}

View File

@ -0,0 +1,19 @@
# Monitor ids exported by the module.
# Each monitor is declared with `count` (0 or 1), so every value below is a
# splat expression over the resource instances rather than a single attribute.

output "function_http_5xx_errors_rate_id" {
  value       = "${datadog_monitor.function_http_5xx_errors_rate.*.id}"
  description = "id for monitor function_http_5xx_errors_rate"
}

output "function_high_connections_count_id" {
  value       = "${datadog_monitor.function_high_connections_count.*.id}"
  description = "id for monitor function_high_connections_count"
}

output "function_high_threads_count_id" {
  value       = "${datadog_monitor.function_high_threads_count.*.id}"
  description = "id for monitor function_high_threads_count"
}

output "function_memory_usage_id" {
  value       = "${datadog_monitor.function_memory_usage.*.id}"
  description = "id for monitor function_memory_usage"
}