From 7fa04791e75d9cc852c95271b56ed9f7a30c216f Mon Sep 17 00:00:00 2001 From: Laurent Piroelle Date: Tue, 26 Feb 2019 11:57:21 +0100 Subject: [PATCH] MON-390 Monitors for Azure Functions --- README.md | 1 + cloud/azure/functions/README.md | 85 ++++++++ cloud/azure/functions/inputs.tf | 222 ++++++++++++++++++++ cloud/azure/functions/metrics.txt | 23 ++ cloud/azure/functions/modules.tf | 9 + cloud/azure/functions/monitors-functions.tf | 124 +++++++++++ cloud/azure/functions/outputs.tf | 19 ++ 7 files changed, 483 insertions(+) create mode 100644 cloud/azure/functions/README.md create mode 100644 cloud/azure/functions/inputs.tf create mode 100644 cloud/azure/functions/metrics.txt create mode 100644 cloud/azure/functions/modules.tf create mode 100644 cloud/azure/functions/monitors-functions.tf create mode 100644 cloud/azure/functions/outputs.tf diff --git a/README.md b/README.md index 1641b86..6141757 100644 --- a/README.md +++ b/README.md @@ -103,6 +103,7 @@ The `//` is very important, it's a terraform specific syntax used to separate gi - [datalakestore](https://git.fr.clara.net/claranet/cloudnative/projects/datadog/terraform/monitors/tree/master/cloud/azure/datalakestore/) - [eventgrid](https://git.fr.clara.net/claranet/cloudnative/projects/datadog/terraform/monitors/tree/master/cloud/azure/eventgrid/) - [eventhub](https://git.fr.clara.net/claranet/cloudnative/projects/datadog/terraform/monitors/tree/master/cloud/azure/eventhub/) + - [functions](https://git.fr.clara.net/claranet/cloudnative/projects/datadog/terraform/monitors/tree/master/cloud/azure/functions/) - [iothubs](https://git.fr.clara.net/claranet/cloudnative/projects/datadog/terraform/monitors/tree/master/cloud/azure/iothubs/) - [keyvault](https://git.fr.clara.net/claranet/cloudnative/projects/datadog/terraform/monitors/tree/master/cloud/azure/keyvault/) - [load-balancer](https://git.fr.clara.net/claranet/cloudnative/projects/datadog/terraform/monitors/tree/master/cloud/azure/load-balancer/) diff --git 
a/cloud/azure/functions/README.md b/cloud/azure/functions/README.md new file mode 100644 index 0000000..ef84a6a --- /dev/null +++ b/cloud/azure/functions/README.md @@ -0,0 +1,85 @@ +# CLOUD AZURE FUNCTIONS DataDog monitors + +## How to use this module + +``` +module "datadog-monitors-cloud-azure-functions" { + source = "git::ssh://git@git.fr.clara.net/claranet/cloudnative/projects/datadog/terraform/monitors.git//cloud/azure/functions?ref={revision}" + + environment = "${var.environment}" + message = "${module.datadog-message-alerting.alerting-message}" + + functions_max_scale_count = 2 +} + +``` + +## Purpose + +Creates DataDog monitors with the following checks: + +- Function App connections count too high +- Function App HTTP 5xx errors too high +- Function App memory usage too high +- Function App threads count too high +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|:----:|:-----:|:-----:| +| environment | Architecture environment | string | n/a | yes | +| evaluation\_delay | Delay in seconds for the metric evaluation | string | `"900"` | no | +| filter\_tags\_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `"*"` | no | +| filter\_tags\_custom\_excluded | Tags excluded for custom filtering when filter_tags_use_defaults is false | string | `""` | no | +| filter\_tags\_use\_defaults | Use default filter tags convention | string | `"true"` | no | +| functions\_max\_scale\_count | Maximum number of Function instances | string | n/a | yes | +| high\_connections\_count\_enabled | Flag to enable Functions high connections count monitor | string | `"true"` | no | +| high\_connections\_count\_extra\_tags | Extra tags for Functions high connections count monitor | list | `[]` | no | +| high\_connections\_count\_message | Custom message for Functions high connections count monitor | string | `""` | no | +| high\_connections\_count\_silenced | Groups to mute for Functions high connections 
count monitor | map | `{}` | no | +| high\_connections\_count\_threshold\_critical | Alerting threshold for Functions high connections count | string | `"290"` | no | +| high\_connections\_count\_threshold\_warning | Warning threshold for Functions high connections count | string | `"250"` | no | +| high\_connections\_count\_time\_aggregator | Monitor aggregator for Functions high connections count [available values: min, max or avg] | string | `"min"` | no | +| high\_connections\_count\_timeframe | Monitor timeframe for Functions high connections count [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no | +| high\_threads\_count\_enabled | Flag to enable Functions high threads count monitor | string | `"true"` | no | +| high\_threads\_count\_extra\_tags | Extra tags for Functions high threads count monitor | list | `[]` | no | +| high\_threads\_count\_message | Custom message for Functions high threads count monitor | string | `""` | no | +| high\_threads\_count\_silenced | Groups to mute for Functions high threads count monitor | map | `{}` | no | +| high\_threads\_count\_threshold\_critical | Alerting threshold for Functions high threads count | string | `"510"` | no | +| high\_threads\_count\_threshold\_warning | Warning threshold for Functions high threads count | string | `"490"` | no | +| high\_threads\_count\_time\_aggregator | Monitor aggregator for Functions high threads count [available values: min, max or avg] | string | `"min"` | no | +| high\_threads\_count\_timeframe | Monitor timeframe for Functions high threads count [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no | +| http\_5xx\_errors\_rate\_enabled | Flag to enable Functions Http 5xx errors rate monitor | string | `"true"` | no | +| http\_5xx\_errors\_rate\_extra\_tags | Extra tags for Functions Http 5xx errors rate monitor | list | `[]` | no | +| 
http\_5xx\_errors\_rate\_message | Custom message for Functions Http 5xx errors rate monitor | string | `""` | no | +| http\_5xx\_errors\_rate\_silenced | Groups to mute for Functions Http 5xx errors rate monitor | map | `{}` | no | +| http\_5xx\_errors\_rate\_threshold\_critical | Alerting threshold for Functions Http 5xx errors rate | string | `"20"` | no | +| http\_5xx\_errors\_rate\_threshold\_warning | Warning threshold for Functions Http 5xx errors rate | string | `"10"` | no | +| http\_5xx\_errors\_rate\_time\_aggregator | Monitor aggregator for Functions Http 5xx errors rate [available values: min, max or avg] | string | `"min"` | no | +| http\_5xx\_errors\_rate\_timeframe | Monitor timeframe for Functions Http 5xx errors rate [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no | +| memory\_usage\_enabled | Flag to enable Functions memory usage monitor | string | `"true"` | no | +| memory\_usage\_extra\_tags | Extra tags for Functions memory usage monitor | list | `[]` | no | +| memory\_usage\_message | Custom message for Functions memory usage monitor | string | `""` | no | +| memory\_usage\_silenced | Groups to mute for Functions memory usage monitor | map | `{}` | no | +| memory\_usage\_threshold\_critical | Alerting threshold for Functions memory usage in bytes | string | `"125829120"` | no | +| memory\_usage\_threshold\_warning | Warning threshold for Functions memory usage in bytes | string | `"104857600"` | no | +| memory\_usage\_time\_aggregator | Monitor aggregator for Functions memory usage [available values: min, max or avg] | string | `"min"` | no | +| memory\_usage\_timeframe | Monitor timeframe for Functions memory usage [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | string | `"last_5m"` | no | +| message | Message sent when a monitor is triggered | string | n/a | yes | +| new\_host\_delay | Delay in seconds before monitor new 
resource | string | `"300"` | no | + +## Outputs + +| Name | Description | +|------|-------------| +| function\_high\_connections\_count\_id | id for monitor function_high_connections_count | +| function\_high\_threads\_count\_id | id for monitor function_high_threads_count | +| function\_http\_5xx\_errors\_rate\_id | id for monitor function_http_5xx_errors_rate | +| function\_memory\_usage\_id | id for monitor function_memory_usage | + +## Related documentation + +Datadog Azure documentation: [https://docs.datadoghq.com/integrations/azure/](https://docs.datadoghq.com/integrations/azure/) + +Azure Monitor metrics: [https://docs.microsoft.com/en-us/azure/azure-monitor/platform/metrics-supported#microsoftwebsites-functions](https://docs.microsoft.com/en-us/azure/azure-monitor/platform/metrics-supported#microsoftwebsites-functions) + +Azure Functions connections limits: [https://docs.microsoft.com/en-us/azure/azure-functions/manage-connections#connections-limit](https://docs.microsoft.com/en-us/azure/azure-functions/manage-connections#connections-limit) diff --git a/cloud/azure/functions/inputs.tf b/cloud/azure/functions/inputs.tf new file mode 100644 index 0000000..ebe0aff --- /dev/null +++ b/cloud/azure/functions/inputs.tf @@ -0,0 +1,222 @@ +variable "environment" { + description = "Architecture environment" + type = "string" +} + +variable "filter_tags_use_defaults" { + description = "Use default filter tags convention" + default = "true" +} + +variable "filter_tags_custom" { + description = "Tags used for custom filtering when filter_tags_use_defaults is false" + default = "*" +} + +variable "filter_tags_custom_excluded" { + description = "Tags excluded for custom filtering when filter_tags_use_defaults is false" + default = "" +} + +variable "message" { + description = "Message sent when a monitor is triggered" +} + +variable "evaluation_delay" { + description = "Delay in seconds for the metric evaluation" + default = 900 +} + +variable "new_host_delay" { + 
description = "Delay in seconds before monitor new resource" + default = 300 +} + +variable "functions_max_scale_count" { + description = "Maximum number of Function instances" +} + +# Azure Function App specific variables +variable "http_5xx_errors_rate_silenced" { + description = "Groups to mute for Functions Http 5xx errors rate monitor" + type = "map" + default = {} +} + +variable "http_5xx_errors_rate_enabled" { + description = "Flag to enable Functions Http 5xx errors rate monitor" + type = "string" + default = "true" +} + +variable "http_5xx_errors_rate_extra_tags" { + description = "Extra tags for Functions Http 5xx errors rate monitor" + type = "list" + default = [] +} + +variable "http_5xx_errors_rate_message" { + description = "Custom message for Functions Http 5xx errors rate monitor" + type = "string" + default = "" +} + +variable "http_5xx_errors_rate_time_aggregator" { + description = "Monitor aggregator for Functions Http 5xx errors rate [available values: min, max or avg]" + type = "string" + default = "min" +} + +variable "http_5xx_errors_rate_timeframe" { + description = "Monitor timeframe for Functions Http 5xx errors rate [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" + type = "string" + default = "last_5m" +} + +variable "http_5xx_errors_rate_threshold_critical" { + default = 20 + description = "Alerting threshold for Functions Http 5xx errors rate" +} + +variable "http_5xx_errors_rate_threshold_warning" { + default = 10 + description = "Warning threshold for Functions Http 5xx errors rate" +} + +variable "high_connections_count_silenced" { + description = "Groups to mute for Functions high connections count monitor" + type = "map" + default = {} +} + +variable "high_connections_count_enabled" { + description = "Flag to enable Functions high connections count monitor" + type = "string" + default = "true" +} + +variable "high_connections_count_extra_tags" { + description = "Extra tags for Functions 
high connections count monitor" + type = "list" + default = [] +} + +variable "high_connections_count_message" { + description = "Custom message for Functions high connections count monitor" + type = "string" + default = "" +} + +variable "high_connections_count_time_aggregator" { + description = "Monitor aggregator for Functions high connections count [available values: min, max or avg]" + type = "string" + default = "min" +} + +variable "high_connections_count_timeframe" { + description = "Monitor timeframe for Functions high connections count [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" + type = "string" + default = "last_5m" +} + +variable "high_connections_count_threshold_critical" { + default = 290 + description = "Alerting threshold for Functions high connections count" +} + +variable "high_connections_count_threshold_warning" { + default = 250 + description = "Warning threshold for Functions high connections count" +} + +variable "high_threads_count_silenced" { + description = "Groups to mute for Functions high threads count monitor" + type = "map" + default = {} +} + +variable "high_threads_count_enabled" { + description = "Flag to enable Functions high threads count monitor" + type = "string" + default = "true" +} + +variable "high_threads_count_extra_tags" { + description = "Extra tags for Functions high threads count monitor" + type = "list" + default = [] +} + +variable "high_threads_count_message" { + description = "Custom message for Functions high threads count monitor" + type = "string" + default = "" +} + +variable "high_threads_count_time_aggregator" { + description = "Monitor aggregator for Functions high threads count [available values: min, max or avg]" + type = "string" + default = "min" +} + +variable "high_threads_count_timeframe" { + description = "Monitor timeframe for Functions high threads count [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" + type 
= "string" + default = "last_5m" +} + +variable "high_threads_count_threshold_critical" { + default = 510 + description = "Alerting threshold for Functions high threads count" +} + +variable "high_threads_count_threshold_warning" { + default = 490 + description = "Warning threshold for Functions high threads count" +} + +variable "memory_usage_silenced" { + description = "Groups to mute for Functions memory usage monitor" + type = "map" + default = {} +} + +variable "memory_usage_enabled" { + description = "Flag to enable Functions memory usage monitor" + type = "string" + default = "true" +} + +variable "memory_usage_extra_tags" { + description = "Extra tags for Functions memory usage monitor" + type = "list" + default = [] +} + +variable "memory_usage_message" { + description = "Custom message for Functions memory usage monitor" + type = "string" + default = "" +} + +variable "memory_usage_time_aggregator" { + description = "Monitor aggregator for Functions memory usage [available values: min, max or avg]" + type = "string" + default = "min" +} + +variable "memory_usage_timeframe" { + description = "Monitor timeframe for Functions memory usage [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" + type = "string" + default = "last_5m" +} + +variable "memory_usage_threshold_critical" { + default = 125829120 # 120 MiB + description = "Alerting threshold for Functions memory usage in bytes" +} + +variable "memory_usage_threshold_warning" { + default = 104857600 # 100 MiB + description = "Warning threshold for Functions memory usage in bytes" +} diff --git a/cloud/azure/functions/metrics.txt b/cloud/azure/functions/metrics.txt new file mode 100644 index 0000000..20e5575 --- /dev/null +++ b/cloud/azure/functions/metrics.txt @@ -0,0 +1,23 @@ +azure.functions.average_memory_working_set +azure.functions.bytes_received +azure.functions.bytes_sent +azure.functions.connections +azure.functions.current_assemblies 
+azure.functions.function_execution_count +azure.functions.function_execution_units +azure.functions.gen_0_garbage_collections +azure.functions.gen_1_garbage_collections +azure.functions.gen_2_garbage_collections +azure.functions.handle_count +azure.functions.http5xx +azure.functions.io_other_bytes_per_second +azure.functions.io_other_operations_per_second +azure.functions.io_read_bytes_per_second +azure.functions.io_read_operations_per_second +azure.functions.io_write_bytes_per_second +azure.functions.io_write_operations_per_second +azure.functions.memory_working_set +azure.functions.private_bytes +azure.functions.thread_count +azure.functions.total_app_domains +azure.functions.total_app_domains_unloaded diff --git a/cloud/azure/functions/modules.tf b/cloud/azure/functions/modules.tf new file mode 100644 index 0000000..b68c9e6 --- /dev/null +++ b/cloud/azure/functions/modules.tf @@ -0,0 +1,9 @@ +module "filter-tags" { + source = "../../../common/filter-tags" + + environment = "${var.environment}" + resource = "azure_functions" + filter_tags_use_defaults = "${var.filter_tags_use_defaults}" + filter_tags_custom = "${var.filter_tags_custom}" + filter_tags_custom_excluded = "${var.filter_tags_custom_excluded}" +} diff --git a/cloud/azure/functions/monitors-functions.tf b/cloud/azure/functions/monitors-functions.tf new file mode 100644 index 0000000..0bd908a --- /dev/null +++ b/cloud/azure/functions/monitors-functions.tf @@ -0,0 +1,124 @@ +resource "datadog_monitor" "function_http_5xx_errors_rate" { + count = "${var.http_5xx_errors_rate_enabled ? 
1 : 0}" + name = "[${var.environment}] Function App HTTP 5xx errors too high {{#is_alert}}{{{comparator}}} {{threshold}}% ({{value}}%){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}}% ({{value}}%){{/is_warning}}" + type = "metric alert" + message = "${coalesce(var.http_5xx_errors_rate_message, var.message)}" + + query = < ${var.http_5xx_errors_rate_threshold_critical} + EOF + + evaluation_delay = "${var.evaluation_delay}" + new_host_delay = "${var.new_host_delay}" + + thresholds { + warning = "${var.http_5xx_errors_rate_threshold_warning}" + critical = "${var.http_5xx_errors_rate_threshold_critical}" + } + + silenced = "${var.http_5xx_errors_rate_silenced}" + + notify_no_data = false + renotify_interval = 0 + require_full_window = false + timeout_h = 1 + include_tags = true + + tags = ["env:${var.environment}", "type:cloud", "provider:azure", "resource:azure_functions", "team:claranet", "created-by:terraform", "${var.http_5xx_errors_rate_extra_tags}"] +} + +resource "datadog_monitor" "function_high_connections_count" { + count = "${var.high_connections_count_enabled ? 
1 : 0}" + name = "[${var.environment}] Function App connections count too high {{#is_alert}}{{{comparator}}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ({{value}}){{/is_warning}}" + type = "metric alert" + message = "${coalesce(var.high_connections_count_message, var.message)}" + + query = < ${var.high_connections_count_threshold_critical} + EOF + + evaluation_delay = "${var.evaluation_delay}" + new_host_delay = "${var.new_host_delay}" + + thresholds { + warning = "${var.high_connections_count_threshold_warning}" + critical = "${var.high_connections_count_threshold_critical}" + } + + silenced = "${var.high_connections_count_silenced}" + + notify_no_data = false + renotify_interval = 0 + require_full_window = false + timeout_h = 1 + include_tags = true + + tags = ["env:${var.environment}", "type:cloud", "provider:azure", "resource:azure_functions", "team:claranet", "created-by:terraform", "${var.high_connections_count_extra_tags}"] +} + +resource "datadog_monitor" "function_high_threads_count" { + count = "${var.high_threads_count_enabled ? 
1 : 0}" + name = "[${var.environment}] Function App threads count too high {{#is_alert}}{{{comparator}}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ({{value}}){{/is_warning}}" + type = "metric alert" + message = "${coalesce(var.high_threads_count_message, var.message)}" + + query = < ${var.high_threads_count_threshold_critical} + EOF + + evaluation_delay = "${var.evaluation_delay}" + new_host_delay = "${var.new_host_delay}" + + thresholds { + warning = "${var.high_threads_count_threshold_warning}" + critical = "${var.high_threads_count_threshold_critical}" + } + + silenced = "${var.high_threads_count_silenced}" + + notify_no_data = false + renotify_interval = 0 + require_full_window = false + timeout_h = 1 + include_tags = true + + tags = ["env:${var.environment}", "type:cloud", "provider:azure", "resource:azure_functions", "team:claranet", "created-by:terraform", "${var.high_threads_count_extra_tags}"] +} + +resource "datadog_monitor" "function_memory_usage" { + count = "${var.memory_usage_enabled ? 
1 : 0}" + name = "[${var.environment}] Function App memory usage too high {{#is_alert}}{{{comparator}}} {{threshold}} ({{value}}){{/is_alert}}{{#is_warning}}{{{comparator}}} {{warn_threshold}} ({{value}}){{/is_warning}}" + type = "metric alert" + message = "${coalesce(var.memory_usage_message, var.message)}" + + query = < ${var.memory_usage_threshold_critical} + EOF + + evaluation_delay = "${var.evaluation_delay}" + new_host_delay = "${var.new_host_delay}" + + thresholds { + warning = "${var.memory_usage_threshold_warning}" + critical = "${var.memory_usage_threshold_critical}" + } + + silenced = "${var.memory_usage_silenced}" + + notify_no_data = false + renotify_interval = 0 + require_full_window = false + timeout_h = 0 + include_tags = true + + tags = ["env:${var.environment}", "type:cloud", "provider:azure", "resource:azure_functions", "team:claranet", "created-by:terraform", "${var.memory_usage_extra_tags}"] +} diff --git a/cloud/azure/functions/outputs.tf b/cloud/azure/functions/outputs.tf new file mode 100644 index 0000000..09ebb70 --- /dev/null +++ b/cloud/azure/functions/outputs.tf @@ -0,0 +1,19 @@ +output "function_http_5xx_errors_rate_id" { + description = "id for monitor function_http_5xx_errors_rate" + value = "${datadog_monitor.function_http_5xx_errors_rate.*.id}" +} + +output "function_high_connections_count_id" { + description = "id for monitor function_high_connections_count" + value = "${datadog_monitor.function_high_connections_count.*.id}" +} + +output "function_high_threads_count_id" { + description = "id for monitor function_high_threads_count" + value = "${datadog_monitor.function_high_threads_count.*.id}" +} + +output "function_memory_usage_id" { + description = "id for monitor function_memory_usage" + value = "${datadog_monitor.function_memory_usage.*.id}" +}