Merged in MON-79-azure-storage-monitors-set-warning-thresholds (pull request #46)
MON-79: Azure Storage - Raise critical thresholds and add warning thresholds to avoid "bagot" alerting during NBH Approved-by: Laurent Piroelle <laurent.piroelle@fr.clara.net> Approved-by: Jérôme Respaut <shr3ps@gmail.com> Approved-by: Alexandre Gaillet <alexandre.gaillet@fr.clara.net> Approved-by: Quentin Manfroi <quentin.manfroi@yahoo.fr>
This commit is contained in:
commit
341a0caafc
@ -31,82 +31,144 @@ Inputs
|
||||
|
||||
| Name | Description | Type | Default | Required |
|
||||
|------|-------------|:----:|:-----:|:-----:|
|
||||
| apimanagement_failed_requests_threshold_critical | Maximum acceptable percent of failed requests | string | `5` | no |
|
||||
| apimanagement_other_requests_threshold_critical | Maximum acceptable percent of other requests | string | `5` | no |
|
||||
| apimanagement_successful_requests_threshold_critical | Minimum acceptable percent of successful requests | string | `90` | no |
|
||||
| apimanagement_unauthorized_requests_threshold_critical | Maximum acceptable percent of unauthorized requests | string | `5` | no |
|
||||
| appservices_http_successful_requests_threshold_critical | Minimum critical acceptable percent of 2xx & 3xx requests | string | `90` | no |
|
||||
| appservices_http_successful_requests_threshold_warning | Minimum warning acceptable percent of 2xx & 3xx requests | string | `95` | no |
|
||||
| appservices_http_4xx_requests_threshold_critical | Maximum critical acceptable percent of 4xx errors | string | `30` | no |
|
||||
| appservices_http_4xx_requests_threshold_warning | Maximum warning acceptable percent of 4xx errors | string | `15` | no |
|
||||
| appservices_http_5xx_requests_threshold_critical | Maximum critical acceptable percent of 5xx errors | string | `20` | no |
|
||||
| appservices_http_5xx_requests_threshold_warning | Maximum warning acceptable percent of 5xx errors | string | `10` | no |
|
||||
| appservices_memory_usage_threshold_critical | Alerting threshold in MiB | string | `52430000` | no |
|
||||
| appservices_memory_usage_threshold_warning | Warning threshold in MiB | string | `33550000` | no |
|
||||
| appservices_response_time_threshold_critical | Alerting threshold in seconds | string | `0.8` | no |
|
||||
| appservices_response_time_threshold_warning | Warning threshold in seconds | string | `0.4` | no |
|
||||
| apimanagement_failed_requests_silenced | Groups to mute for API Management failed requests monitor | map | `<map>` | no |
|
||||
| apimanagement_failed_requests_threshold_critical | Maximum acceptable percent of failed requests | string | `90` | no |
|
||||
| apimanagement_failed_requests_threshold_warning | Warning regarding acceptable percent of failed requests | string | `50` | no |
|
||||
| apimanagement_other_requests_silenced | Groups to mute for API Management other requests monitor | map | `<map>` | no |
|
||||
| apimanagement_other_requests_threshold_critical | Maximum acceptable percent of other requests | string | `90` | no |
|
||||
| apimanagement_other_requests_threshold_warning | Warning regarding acceptable percent of other requests | string | `50` | no |
|
||||
| apimanagement_status_silenced | Groups to mute for API Management status monitor | map | `<map>` | no |
|
||||
| apimanagement_successful_requests_silenced | Groups to mute for API Management successful requests monitor | map | `<map>` | no |
|
||||
| apimanagement_successful_requests_threshold_critical | Minimum acceptable percent of successful requests | string | `10` | no |
|
||||
| apimanagement_successful_requests_threshold_warning | Warning regarding acceptable percent of successful requests | string | `30` | no |
|
||||
| apimanagement_unauthorized_requests_silenced | Groups to mute for API Management unauthorized requests monitor | map | `<map>` | no |
|
||||
| apimanagement_unauthorized_requests_threshold_critical | Maximum acceptable percent of unauthorized requests | string | `90` | no |
|
||||
| apimanagement_unauthorized_requests_threshold_warning | Warning regarding acceptable percent of unauthorized requests | string | `50` | no |
|
||||
| appservices_http_4xx_requests_silenced | Groups to mute for App Services 4xx requests monitor | map | `<map>` | no |
|
||||
| appservices_http_4xx_requests_threshold_critical | Maximum critical acceptable percent of 4xx errors | string | `90` | no |
|
||||
| appservices_http_4xx_requests_threshold_warning | Warning regarding acceptable percent of 4xx errors | string | `50` | no |
|
||||
| appservices_http_5xx_requests_silenced | Groups to mute for App Services 5xx requests monitor | map | `<map>` | no |
|
||||
| appservices_http_5xx_requests_threshold_critical | Maximum critical acceptable percent of 5xx errors | string | `90` | no |
|
||||
| appservices_http_5xx_requests_threshold_warning | Warning regarding acceptable percent of 5xx errors | string | `50` | no |
|
||||
| appservices_http_successful_requests_silenced | Groups to mute for App Services successful requests monitor | map | `<map>` | no |
|
||||
| appservices_http_successful_requests_threshold_critical | Minimum critical acceptable percent of 2xx & 3xx requests | string | `10` | no |
|
||||
| appservices_http_successful_requests_threshold_warning | Warning regarding acceptable percent of 2xx & 3xx requests | string | `30` | no |
|
||||
| appservices_memory_usage_silenced | Groups to mute for App Services memory usage monitor | map | `<map>` | no |
|
||||
| appservices_memory_usage_threshold_critical | Alerting threshold in MiB | string | `1073741824` | no |
|
||||
| appservices_memory_usage_threshold_warning | Warning threshold in MiB | string | `536870912` | no |
|
||||
| appservices_response_time_silenced | Groups to mute for App Services response time monitor | map | `<map>` | no |
|
||||
| appservices_response_time_threshold_critical | Alerting threshold for response time in seconds | string | `10` | no |
|
||||
| appservices_response_time_threshold_warning | Warning threshold for response time in seconds | string | `5` | no |
|
||||
| delay | Delay in seconds for the metric evaluation | string | `600` | no |
|
||||
| environment | Architecture environment | string | - | yes |
|
||||
| eventhub_errors_rate_thresold_critical | Errors ratio (percentage) to trigger the critical alert | string | `3` | no |
|
||||
| eventhub_errors_rate_thresold_warning | Errors ratio (percentage) to trigger a warning alert | string | `1` | no |
|
||||
| eventhub_failed_requests_rate_thresold_critical | Failed requests ratio (percentage) to trigger the critical alert | string | `3` | no |
|
||||
| eventhub_failed_requests_rate_thresold_warning | Failed requests ratio (percentage) to trigger a warning alert | string | `1` | no |
|
||||
| eventhub_errors_rate_silenced | Groups to mute for Event Hub errors monitor | map | `<map>` | no |
|
||||
| eventhub_errors_rate_thresold_critical | Errors ratio (percentage) to trigger the critical alert | string | `90` | no |
|
||||
| eventhub_errors_rate_thresold_warning | Errors ratio (percentage) to trigger a warning alert | string | `50` | no |
|
||||
| eventhub_failed_requests_rate_silenced | Groups to mute for Event Hub failed requests monitor | map | `<map>` | no |
|
||||
| eventhub_failed_requests_rate_thresold_critical | Failed requests ratio (percentage) to trigger the critical alert | string | `90` | no |
|
||||
| eventhub_failed_requests_rate_thresold_warning | Failed requests ratio (percentage) to trigger a warning alert | string | `50` | no |
|
||||
| eventhub_status_silenced | Groups to mute for Event Hub status monitor | map | `<map>` | no |
|
||||
| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no |
|
||||
| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no |
|
||||
| iothub_dropped_d2c_telemetry_egress_silenced | Groups to mute for IoT Hub dropped d2c telemetry monitor | map | `<map>` | no |
|
||||
| iothub_dropped_d2c_telemetry_egress_threshold_critical | D2C Telemetry Dropped limit (critical threshold) | string | `1000` | no |
|
||||
| iothub_dropped_d2c_telemetry_egress_threshold_warning | D2C Telemetry Dropped limit (warning threshold) | string | `500` | no |
|
||||
| iothub_failed_c2d_methods_rate_threshold_critical | C2D Methods Failed rate limit (critical threshold) | string | `10` | no |
|
||||
| iothub_failed_c2d_methods_rate_threshold_warning | C2D Methods Failed rate limit (warning threshold) | string | `0` | no |
|
||||
| iothub_failed_c2d_twin_read_rate_threshold_critical | C2D Twin Read Failed rate limit (critical threshold) | string | `10` | no |
|
||||
| iothub_failed_c2d_twin_read_rate_threshold_warning | C2D Twin Read Failed rate limit (warning threshold) | string | `0` | no |
|
||||
| iothub_failed_c2d_twin_update_rate_threshold_critical | C2D Twin Update Failed rate limit (critical threshold) | string | `10` | no |
|
||||
| iothub_failed_c2d_twin_update_rate_threshold_warning | C2D Twin Update Failed rate limit (warning threshold) | string | `0` | no |
|
||||
| iothub_failed_d2c_twin_read_rate_threshold_critical | D2C Twin Read Failed rate limit (critical threshold) | string | `10` | no |
|
||||
| iothub_failed_d2c_twin_read_rate_threshold_warning | D2C Twin Read Failed rate limit (warning threshold) | string | `0` | no |
|
||||
| iothub_failed_d2c_twin_update_rate_threshold_critical | D2C Twin Update Failed rate limit (critical threshold) | string | `10` | no |
|
||||
| iothub_failed_d2c_twin_update_rate_threshold_warning | D2C Twin Update Failed rate limit (warning threshold) | string | `0` | no |
|
||||
| iothub_failed_jobs_rate_threshold_critical | Jobs Failed rate limit (critical threshold) | string | `10` | no |
|
||||
| iothub_failed_jobs_rate_threshold_warning | Jobs Failed rate limit (warning threshold) | string | `0` | no |
|
||||
| iothub_failed_listjobs_rate_threshold_critical | ListJobs Failed rate limit (critical threshold) | string | `10` | no |
|
||||
| iothub_failed_listjobs_rate_threshold_warning | ListJobs Failed rate limit (warning threshold) | string | `0` | no |
|
||||
| iothub_failed_queryjobs_rate_threshold_critical | QueryJobs Failed rate limit (critical threshold) | string | `10` | no |
|
||||
| iothub_failed_queryjobs_rate_threshold_warning | QueryJobs Failed rate limit (warning threshold) | string | `0` | no |
|
||||
| iothub_failed_c2d_methods_rate_silenced | Groups to mute for IoT Hub failed c2d methods monitor | map | `<map>` | no |
|
||||
| iothub_failed_c2d_methods_rate_threshold_critical | C2D Methods Failed rate limit (critical threshold) | string | `90` | no |
|
||||
| iothub_failed_c2d_methods_rate_threshold_warning | C2D Methods Failed rate limit (warning threshold) | string | `50` | no |
|
||||
| iothub_failed_c2d_twin_read_rate_silenced | Groups to mute for IoT Hub failed c2d twin read monitor | map | `<map>` | no |
|
||||
| iothub_failed_c2d_twin_read_rate_threshold_critical | C2D Twin Read Failed rate limit (critical threshold) | string | `90` | no |
|
||||
| iothub_failed_c2d_twin_read_rate_threshold_warning | C2D Twin Read Failed rate limit (warning threshold) | string | `50` | no |
|
||||
| iothub_failed_c2d_twin_update_rate_silenced | Groups to mute for IoT Hub failed c2d twin update monitor | map | `<map>` | no |
|
||||
| iothub_failed_c2d_twin_update_rate_threshold_critical | C2D Twin Update Failed rate limit (critical threshold) | string | `90` | no |
|
||||
| iothub_failed_c2d_twin_update_rate_threshold_warning | C2D Twin Update Failed rate limit (warning threshold) | string | `50` | no |
|
||||
| iothub_failed_d2c_twin_read_rate_silenced | Groups to mute for IoT Hub failed d2c twin read monitor | map | `<map>` | no |
|
||||
| iothub_failed_d2c_twin_read_rate_threshold_critical | D2C Twin Read Failed rate limit (critical threshold) | string | `90` | no |
|
||||
| iothub_failed_d2c_twin_read_rate_threshold_warning | D2C Twin Read Failed rate limit (warning threshold) | string | `50` | no |
|
||||
| iothub_failed_d2c_twin_update_rate_silenced | Groups to mute for IoT Hub failed d2c twin update monitor | map | `<map>` | no |
|
||||
| iothub_failed_d2c_twin_update_rate_threshold_critical | D2C Twin Update Failed rate limit (critical threshold) | string | `90` | no |
|
||||
| iothub_failed_d2c_twin_update_rate_threshold_warning | D2C Twin Update Failed rate limit (warning threshold) | string | `50` | no |
|
||||
| iothub_failed_jobs_rate_silenced | Groups to mute for IoT Hub failed jobs monitor | map | `<map>` | no |
|
||||
| iothub_failed_jobs_rate_threshold_critical | Jobs Failed rate limit (critical threshold) | string | `90` | no |
|
||||
| iothub_failed_jobs_rate_threshold_warning | Jobs Failed rate limit (warning threshold) | string | `50` | no |
|
||||
| iothub_failed_listjobs_rate_silenced | Groups to mute for IoT Hub failed list jobs monitor | map | `<map>` | no |
|
||||
| iothub_failed_listjobs_rate_threshold_critical | ListJobs Failed rate limit (critical threshold) | string | `90` | no |
|
||||
| iothub_failed_listjobs_rate_threshold_warning | ListJobs Failed rate limit (warning threshold) | string | `50` | no |
|
||||
| iothub_failed_queryjobs_rate_silenced | Groups to mute for IoT Hub failed query jobs monitor | map | `<map>` | no |
|
||||
| iothub_failed_queryjobs_rate_threshold_critical | QueryJobs Failed rate limit (critical threshold) | string | `90` | no |
|
||||
| iothub_failed_queryjobs_rate_threshold_warning | QueryJobs Failed rate limit (warning threshold) | string | `50` | no |
|
||||
| iothub_fallback_d2c_telemetry_egress_silenced | Groups to mute for IoT Hub fallback d2c telemetry monitor | map | `<map>` | no |
|
||||
| iothub_fallback_d2c_telemetry_egress_threshold_critical | D2C Telemetry Fallback limit (critical threshold) | string | `1000` | no |
|
||||
| iothub_fallback_d2c_telemetry_egress_threshold_warning | D2C Telemetry Fallback limit (warning threshold) | string | `500` | no |
|
||||
| iothub_invalid_d2c_telemetry_egress_silenced | Groups to mute for IoT Hub invalid d2c telemetry monitor | map | `<map>` | no |
|
||||
| iothub_invalid_d2c_telemetry_egress_threshold_critical | D2C Telemetry Invalid limit (critical threshold) | string | `1000` | no |
|
||||
| iothub_invalid_d2c_telemetry_egress_threshold_warning | D2C Telemetry Invalid limit (warning threshold) | string | `500` | no |
|
||||
| iothub_orphaned_d2c_telemetry_egress_silenced | Groups to mute for IoT Hub orphaned d2c telemetry monitor | map | `<map>` | no |
|
||||
| iothub_orphaned_d2c_telemetry_egress_threshold_critical | D2C Telemetry Orphaned limit (critical threshold) | string | `1000` | no |
|
||||
| iothub_orphaned_d2c_telemetry_egress_threshold_warning | D2C Telemetry Orphaned limit (warning threshold) | string | `500` | no |
|
||||
| iothub_status_silenced | Groups to mute for IoT Hub status monitor | map | `<map>` | no |
|
||||
| iothub_too_many_d2c_telemetry_ingress_nosent_silenced | Groups to mute for IoT Hub unsent d2c telemetry monitor | map | `<map>` | no |
|
||||
| iothub_total_devices_silenced | Groups to mute for IoT Hub total device monitor | map | `<map>` | no |
|
||||
| message | Message sent when a monitor is triggered | string | - | yes |
|
||||
| non_taggable_filter_tags | Tags used for filtering for components without tag support | string | `*` | no |
|
||||
| redis_evictedkeys_limit_silenced | Groups to mute for Redis evicted keys monitor | map | `<map>` | no |
|
||||
| redis_evictedkeys_limit_threshold_critical | Evicted keys limit (critical threshold) | string | `100` | no |
|
||||
| redis_evictedkeys_limit_threshold_warning | Evicted keys limit (warning threshold) | string | `0` | no |
|
||||
| redis_percent_processor_time_silenced | Groups to mute for Redis processor monitor | map | `<map>` | no |
|
||||
| redis_percent_processor_time_threshold_critical | Processor time percent (critical threshold) | string | `80` | no |
|
||||
| redis_percent_processor_time_threshold_warning | Processor time percent (warning threshold) | string | `60` | no |
|
||||
| redis_server_load_rate_silenced | Groups to mute for Redis server load monitor | map | `<map>` | no |
|
||||
| redis_server_load_rate_threshold_critical | Server CPU load rate (critical threshold) | string | `90` | no |
|
||||
| redis_server_load_rate_threshold_warning | Server CPU load rate (warning threshold) | string | `70` | no |
|
||||
| sqldatabase_cpu_silenced | Groups to mute for SQL CPU monitor | map | `<map>` | no |
|
||||
| sqldatabase_cpu_threshold_critical | CPU usage in percent (critical threshold) | string | `90` | no |
|
||||
| sqldatabase_cpu_threshold_warning | CPU usage in percent (warning threshold) | string | `80` | no |
|
||||
| sqldatabase_deadlock_silenced | Groups to mute for SQL Deadlock monitor | map | `<map>` | no |
|
||||
| sqldatabase_deadlock_threshold_critical | Amount of Deadlocks (critical threshold) | string | `1` | no |
|
||||
| sqldatabase_diskspace_silenced | Groups to mute for SQL disk space monitor | map | `<map>` | no |
|
||||
| sqldatabase_diskspace_threshold_critical | Disk space used in percent (critical threshold) | string | `90` | no |
|
||||
| sqldatabase_diskspace_threshold_warning | Disk space used in percent (warning threshold) | string | `80` | no |
|
||||
| sqldatabase_dtu_silenced | Groups to mute for SQL DTU monitor | map | `<map>` | no |
|
||||
| sqldatabase_dtu_threshold_critical | Amount of DTU used (critical threshold) | string | `90` | no |
|
||||
| sqldatabase_dtu_threshold_warning | Amount of DTU used (warning threshold) | string | `85` | no |
|
||||
| storage_authorization_error_requests_threshold_critical | Maximum acceptable percent of authorization error requests for a storage | string | `15` | no |
|
||||
| storage_availability_threshold_critical | Minimum acceptable percent of availability for a storage | string | `90` | no |
|
||||
| storage_client_other_error_requests_threshold_critical | Maximum acceptable percent of client other error requests for a storage | string | `15` | no |
|
||||
| storage_latency_threshold_critical | Maximum acceptable end to end latency (ms) for a storage | string | `1000` | no |
|
||||
| storage_network_error_requests_threshold_critical | Maximum acceptable percent of network error requests for a storage | string | `5` | no |
|
||||
| storage_server_other_error_requests_threshold_critical | Maximum acceptable percent of server other error requests for a storage | string | `10` | no |
|
||||
| storage_successful_requests_threshold_critical | Minimum acceptable percent of successful requests for a storage | string | `90` | no |
|
||||
| storage_throttling_error_requests_threshold_critical | Maximum acceptable percent of throttling error requests for a storage | string | `10` | no |
|
||||
| storage_timeout_error_requests_threshold_critical | Maximum acceptable percent of timeout error requests for a storage | string | `5` | no |
|
||||
| storage_authorization_error_requests_silenced | Groups to mute for Storage authorization errors monitor | map | `<map>` | no |
|
||||
| storage_authorization_error_requests_threshold_critical | Maximum acceptable percent of authorization error requests for a storage | string | `90` | no |
|
||||
| storage_authorization_error_requests_threshold_warning | Warning regarding acceptable percent of authorization error requests for a storage | string | `50` | no |
|
||||
| storage_availability_silenced | Groups to mute for Storage availability monitor | map | `<map>` | no |
|
||||
| storage_availability_threshold_critical | Minimum acceptable percent of availability for a storage | string | `50` | no |
|
||||
| storage_availability_threshold_warning | Warning regarding acceptable percent of availability for a storage | string | `90` | no |
|
||||
| storage_client_other_error_requests_silenced | Groups to mute for Storage other errors monitor | map | `<map>` | no |
|
||||
| storage_client_other_error_requests_threshold_critical | Maximum acceptable percent of client other error requests for a storage | string | `90` | no |
|
||||
| storage_client_other_error_requests_threshold_warning | Warning regarding acceptable percent of client other error requests for a storage | string | `50` | no |
|
||||
| storage_latency_silenced | Groups to mute for Storage latency monitor | map | `<map>` | no |
|
||||
| storage_latency_threshold_critical | Maximum acceptable end to end latency (ms) for a storage | string | `2000` | no |
|
||||
| storage_latency_threshold_warning | Warning regarding acceptable end to end latency (ms) for a storage | string | `1000` | no |
|
||||
| storage_network_error_requests_silenced | Groups to mute for Storage network errors monitor | map | `<map>` | no |
|
||||
| storage_network_error_requests_threshold_critical | Maximum acceptable percent of network error requests for a storage | string | `90` | no |
|
||||
| storage_network_error_requests_threshold_warning | Warning regarding acceptable percent of network error requests for a storage | string | `50` | no |
|
||||
| storage_server_other_error_requests_silenced | Groups to mute for Storage server other errors monitor | map | `<map>` | no |
|
||||
| storage_server_other_error_requests_threshold_critical | Maximum acceptable percent of server other error requests for a storage | string | `90` | no |
|
||||
| storage_server_other_error_requests_threshold_warning | Warning regarding acceptable percent of server other error requests for a storage | string | `50` | no |
|
||||
| storage_successful_requests_silenced | Groups to mute for Storage successful requests monitor | map | `<map>` | no |
|
||||
| storage_successful_requests_threshold_critical | Minimum acceptable percent of successful requests for a storage | string | `10` | no |
|
||||
| storage_successful_requests_threshold_warning | Warning regarding acceptable percent of successful requests for a storage | string | `30` | no |
|
||||
| storage_throttling_error_requests_silenced | Groups to mute for Storage throttling error monitor | map | `<map>` | no |
|
||||
| storage_throttling_error_requests_threshold_critical | Maximum acceptable percent of throttling error requests for a storage | string | `90` | no |
|
||||
| storage_throttling_error_requests_threshold_warning | Warning regarding acceptable percent of throttling error requests for a storage | string | `50` | no |
|
||||
| storage_timeout_error_requests_silenced | Groups to mute for Storage timeout monitor | map | `<map>` | no |
|
||||
| storage_timeout_error_requests_threshold_critical | Maximum acceptable percent of timeout error requests for a storage | string | `90` | no |
|
||||
| storage_timeout_error_requests_threshold_warning | Warning regarding acceptable percent of timeout error requests for a storage | string | `50` | no |
|
||||
| streamanalytics_conversion_errors_silenced | Groups to mute for Stream Analytics conversion errors monitor | map | `<map>` | no |
|
||||
| streamanalytics_conversion_errors_threshold_critical | Conversion errors limit (critical threshold) | string | `10` | no |
|
||||
| streamanalytics_conversion_errors_threshold_warning | Conversion errors limit (warning threshold) | string | `0` | no |
|
||||
| streamanalytics_failed_function_requests_silenced | Groups to mute for Stream Analytics failed requests monitor | map | `<map>` | no |
|
||||
| streamanalytics_failed_function_requests_threshold_critical | Failed Function Request rate limit (critical threshold) | string | `10` | no |
|
||||
| streamanalytics_function_requests_threshold_warning | Failed Function Request rate limit (warning threshold) | string | `0` | no |
|
||||
| streamanalytics_failed_function_requests_threshold_warning | Failed Function Request rate limit (warning threshold) | string | `0` | no |
|
||||
| streamanalytics_runtime_errors_silenced | Groups to mute for Stream Analytics runtime errors monitor | map | `<map>` | no |
|
||||
| streamanalytics_runtime_errors_threshold_critical | Runtime errors limit (critical threshold) | string | `10` | no |
|
||||
| streamanalytics_runtime_errors_threshold_warning | Runtime errors limit (warning threshold) | string | `0` | no |
|
||||
| streamanalytics_status_silenced | Groups to mute for Stream Analytics status monitor | map | `<map>` | no |
|
||||
| streamanalytics_su_utilization_silenced | Groups to mute for Stream Analytics utilization monitor | map | `<map>` | no |
|
||||
| streamanalytics_su_utilization_threshold_critical | Streaming Unit utilization rate limit (critical threshold) | string | `80` | no |
|
||||
| streamanalytics_su_utilization_threshold_warning | Streaming Unit utilization rate limit (warning threshold) | string | `60` | no |
|
||||
|
||||
|
||||
@ -29,13 +29,22 @@ Inputs
|
||||
|------|-------------|:----:|:-----:|:-----:|
|
||||
| delay | Delay in seconds for the metric evaluation | string | `600` | no |
|
||||
| environment | Architecture environment | string | - | yes |
|
||||
| failed_requests_threshold_critical | Maximum acceptable percent of failed requests | string | `5` | no |
|
||||
| failed_requests_silenced | Groups to mute for API Management failed requests monitor | map | `<map>` | no |
|
||||
| failed_requests_threshold_critical | Maximum acceptable percent of failed requests | string | `90` | no |
|
||||
| failed_requests_threshold_warning | Warning regarding acceptable percent of failed requests | string | `50` | no |
|
||||
| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no |
|
||||
| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no |
|
||||
| message | Message sent when a Redis monitor is triggered | string | - | yes |
|
||||
| other_requests_threshold_critical | Maximum acceptable percent of other requests | string | `5` | no |
|
||||
| successful_requests_threshold_critical | Minimum acceptable percent of successful requests | string | `90` | no |
|
||||
| unauthorized_requests_threshold_critical | Maximum acceptable percent of unauthorized requests | string | `5` | no |
|
||||
| other_requests_silenced | Groups to mute for API Management other requests monitor | map | `<map>` | no |
|
||||
| other_requests_threshold_critical | Maximum acceptable percent of other requests | string | `90` | no |
|
||||
| other_requests_threshold_warning | Warning regarding acceptable percent of other requests | string | `50` | no |
|
||||
| status_silenced | Groups to mute for API Management status monitor | map | `<map>` | no |
|
||||
| successful_requests_silenced | Groups to mute for API Management successful requests monitor | map | `<map>` | no |
|
||||
| successful_requests_threshold_critical | Minimum acceptable percent of successful requests | string | `10` | no |
|
||||
| successful_requests_threshold_warning | Warning regarding acceptable percent of successful requests | string | `30` | no |
|
||||
| unauthorized_requests_silenced | Groups to mute for API Management unauthorized requests monitor | map | `<map>` | no |
|
||||
| unauthorized_requests_threshold_critical | Maximum acceptable percent of unauthorized requests | string | `90` | no |
|
||||
| unauthorized_requests_threshold_warning | Warning regarding acceptable percent of unauthorized requests | string | `50` | no |
|
||||
|
||||
Related documentation
|
||||
---------------------
|
||||
|
||||
@ -25,22 +25,72 @@ variable "filter_tags_custom" {
|
||||
}
|
||||
|
||||
# Azure API Management specific
|
||||
variable "status_silenced" {
|
||||
description = "Groups to mute for API Management status monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "failed_requests_silenced" {
|
||||
description = "Groups to mute for API Management failed requests monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "failed_requests_threshold_critical" {
|
||||
description = "Maximum acceptable percent of failed requests"
|
||||
default = 5
|
||||
default = 90
|
||||
}
|
||||
|
||||
variable "failed_requests_threshold_warning" {
|
||||
description = "Warning regarding acceptable percent of failed requests"
|
||||
default = 50
|
||||
}
|
||||
|
||||
variable "other_requests_silenced" {
|
||||
description = "Groups to mute for API Management other requests monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "other_requests_threshold_critical" {
|
||||
description = "Maximum acceptable percent of other requests"
|
||||
default = 5
|
||||
default = 90
|
||||
}
|
||||
|
||||
variable "other_requests_threshold_warning" {
|
||||
description = "Warning regarding acceptable percent of other requests"
|
||||
default = 50
|
||||
}
|
||||
|
||||
variable "unauthorized_requests_silenced" {
|
||||
description = "Groups to mute for API Management unauthorized requests monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "unauthorized_requests_threshold_critical" {
|
||||
description = "Maximum acceptable percent of unauthorized requests"
|
||||
default = 5
|
||||
default = 90
|
||||
}
|
||||
|
||||
variable "unauthorized_requests_threshold_warning" {
|
||||
description = "Warning regarding acceptable percent of unauthorized requests"
|
||||
default = 50
|
||||
}
|
||||
|
||||
variable "successful_requests_silenced" {
|
||||
description = "Groups to mute for API Management successful requests monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "successful_requests_threshold_critical" {
|
||||
description = "Minimum acceptable percent of successful requests"
|
||||
default = 90
|
||||
default = 10
|
||||
}
|
||||
|
||||
variable "successful_requests_threshold_warning" {
|
||||
description = "Warning regarding acceptable percent of successful requests"
|
||||
default = 30
|
||||
}
|
||||
|
||||
@ -9,7 +9,7 @@ data "template_file" "filter" {
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "apimgt_status" {
|
||||
name = "[${var.environment}] API Management status is not ok on {{name}}"
|
||||
name = "[${var.environment}] API Management is down"
|
||||
message = "${var.message}"
|
||||
|
||||
query = <<EOF
|
||||
@ -22,6 +22,8 @@ resource "datadog_monitor" "apimgt_status" {
|
||||
critical = 1
|
||||
}
|
||||
|
||||
silenced = "${var.status_silenced}"
|
||||
|
||||
notify_no_data = true
|
||||
evaluation_delay = "${var.delay}"
|
||||
renotify_interval = 0
|
||||
@ -29,7 +31,7 @@ resource "datadog_monitor" "apimgt_status" {
|
||||
timeout_h = 0
|
||||
include_tags = true
|
||||
locked = false
|
||||
require_full_window = true
|
||||
require_full_window = false
|
||||
new_host_delay = "${var.delay}"
|
||||
no_data_timeframe = 20
|
||||
|
||||
@ -37,7 +39,7 @@ resource "datadog_monitor" "apimgt_status" {
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "apimgt_failed_requests" {
|
||||
name = "[${var.environment}] API Management {{name}} too much failed requests"
|
||||
name = "[${var.environment}] API Management too many failed requests {{comparator}} {{#is_alert}}{{threshold}}%{{/is_alert}}{{#is_warning}}{{warn_threshold}}%{{/is_warning}} ({{value}}%)"
|
||||
message = "${var.message}"
|
||||
|
||||
query = <<EOF
|
||||
@ -49,15 +51,18 @@ resource "datadog_monitor" "apimgt_failed_requests" {
|
||||
|
||||
thresholds {
|
||||
critical = "${var.failed_requests_threshold_critical}"
|
||||
warning = "${var.failed_requests_threshold_warning}"
|
||||
}
|
||||
|
||||
silenced = "${var.failed_requests_silenced}"
|
||||
|
||||
type = "metric alert"
|
||||
notify_no_data = false
|
||||
notify_audit = false
|
||||
timeout_h = 1
|
||||
include_tags = true
|
||||
locked = false
|
||||
require_full_window = true
|
||||
require_full_window = false
|
||||
new_host_delay = "${var.delay}"
|
||||
evaluation_delay = "${var.delay}"
|
||||
renotify_interval = 0
|
||||
@ -67,7 +72,7 @@ resource "datadog_monitor" "apimgt_failed_requests" {
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "apimgt_other_requests" {
|
||||
name = "[${var.environment}] API Management {{name}} too much other requests"
|
||||
name = "[${var.environment}] API Management too many other requests {{comparator}} {{#is_alert}}{{threshold}}%{{/is_alert}}{{#is_warning}}{{warn_threshold}}%{{/is_warning}} ({{value}}%)"
|
||||
message = "${var.message}"
|
||||
|
||||
query = <<EOF
|
||||
@ -79,15 +84,18 @@ resource "datadog_monitor" "apimgt_other_requests" {
|
||||
|
||||
thresholds {
|
||||
critical = "${var.other_requests_threshold_critical}"
|
||||
warning = "${var.other_requests_threshold_warning}"
|
||||
}
|
||||
|
||||
silenced = "${var.other_requests_silenced}"
|
||||
|
||||
type = "metric alert"
|
||||
notify_no_data = false
|
||||
notify_audit = false
|
||||
timeout_h = 1
|
||||
include_tags = true
|
||||
locked = false
|
||||
require_full_window = true
|
||||
require_full_window = false
|
||||
new_host_delay = "${var.delay}"
|
||||
evaluation_delay = "${var.delay}"
|
||||
renotify_interval = 0
|
||||
@ -97,7 +105,7 @@ resource "datadog_monitor" "apimgt_other_requests" {
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "apimgt_unauthorized_requests" {
|
||||
name = "[${var.environment}] API Management {{name}} too much unauthorized requests"
|
||||
name = "[${var.environment}] API Management too many unauthorized requests {{comparator}} {{#is_alert}}{{threshold}}%{{/is_alert}}{{#is_warning}}{{warn_threshold}}%{{/is_warning}} ({{value}}%)"
|
||||
message = "${var.message}"
|
||||
|
||||
query = <<EOF
|
||||
@ -109,15 +117,18 @@ resource "datadog_monitor" "apimgt_unauthorized_requests" {
|
||||
|
||||
thresholds {
|
||||
critical = "${var.unauthorized_requests_threshold_critical}"
|
||||
warning = "${var.unauthorized_requests_threshold_warning}"
|
||||
}
|
||||
|
||||
silenced = "${var.unauthorized_requests_silenced}"
|
||||
|
||||
type = "metric alert"
|
||||
notify_no_data = false
|
||||
notify_audit = false
|
||||
timeout_h = 1
|
||||
include_tags = true
|
||||
locked = false
|
||||
require_full_window = true
|
||||
require_full_window = false
|
||||
new_host_delay = "${var.delay}"
|
||||
evaluation_delay = "${var.delay}"
|
||||
renotify_interval = 0
|
||||
@ -127,7 +138,7 @@ resource "datadog_monitor" "apimgt_unauthorized_requests" {
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "apimgt_successful_requests" {
|
||||
name = "[${var.environment}] API Management {{name}} successful requests rate too low"
|
||||
name = "[${var.environment}] API Management successful requests rate too low {{comparator}} {{#is_alert}}{{threshold}}%{{/is_alert}}{{#is_warning}}{{warn_threshold}}%{{/is_warning}} ({{value}}%)"
|
||||
message = "${var.message}"
|
||||
|
||||
query = <<EOF
|
||||
@ -139,15 +150,18 @@ resource "datadog_monitor" "apimgt_successful_requests" {
|
||||
|
||||
thresholds {
|
||||
critical = "${var.successful_requests_threshold_critical}"
|
||||
warning = "${var.successful_requests_threshold_warning}"
|
||||
}
|
||||
|
||||
silenced = "${var.successful_requests_silenced}"
|
||||
|
||||
type = "metric alert"
|
||||
notify_no_data = false
|
||||
notify_audit = false
|
||||
timeout_h = 1
|
||||
include_tags = true
|
||||
locked = false
|
||||
require_full_window = true
|
||||
require_full_window = false
|
||||
new_host_delay = "${var.delay}"
|
||||
evaluation_delay = "${var.delay}"
|
||||
renotify_interval = 0
|
||||
|
||||
@ -32,17 +32,22 @@ Inputs
|
||||
| environment | Architecture environment | string | - | yes |
|
||||
| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no |
|
||||
| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no |
|
||||
| http_successful_requests_threshold_critical | Minimum critical acceptable percent of 2xx & 3xx requests | string | `90` | no |
|
||||
| http_successful_requests_threshold_warning | Minimum warning acceptable percent of 2xx & 3xx requests | string | `95` | no |
|
||||
| http_4xx_requests_threshold_critical | Maximum critical acceptable percent of 4xx errors | string | `30` | no |
|
||||
| http_4xx_requests_threshold_warning | Maximum warning acceptable percent of 4xx errors | string | `15` | no |
|
||||
| http_5xx_requests_threshold_critical | Maximum critical acceptable percent of 5xx errors | string | `20` | no |
|
||||
| http_5xx_requests_threshold_warning | Maximum warning acceptable percent of 5xx errors | string | `10` | no |
|
||||
| memory_usage_threshold_critical | Alerting threshold in Mib | string | `52430000` | no |
|
||||
| memory_usage_threshold_warning | Warning threshold in MiB | string | `33550000` | no |
|
||||
| http_4xx_requests_silenced | Groups to mute for App Services 4xx requests monitor | map | `<map>` | no |
|
||||
| http_4xx_requests_threshold_critical | Maximum critical acceptable percent of 4xx errors | string | `90` | no |
|
||||
| http_4xx_requests_threshold_warning | Warning regarding acceptable percent of 4xx errors | string | `50` | no |
|
||||
| http_5xx_requests_silenced | Groups to mute for App Services 5xx requests monitor | map | `<map>` | no |
|
||||
| http_5xx_requests_threshold_critical | Maximum critical acceptable percent of 5xx errors | string | `90` | no |
|
||||
| http_5xx_requests_threshold_warning | Warning regarding acceptable percent of 5xx errors | string | `50` | no |
|
||||
| http_successful_requests_silenced | Groups to mute for App Services successful requests monitor | map | `<map>` | no |
|
||||
| http_successful_requests_threshold_critical | Minimum critical acceptable percent of 2xx & 3xx requests | string | `10` | no |
|
||||
| http_successful_requests_threshold_warning | Warning regarding acceptable percent of 2xx & 3xx requests | string | `30` | no |
|
||||
| memory_usage_silenced | Groups to mute for App Services memory usage monitor | map | `<map>` | no |
|
||||
| memory_usage_threshold_critical | Alerting threshold in Mib | string | `1073741824` | no |
|
||||
| memory_usage_threshold_warning | Warning threshold in MiB | string | `536870912` | no |
|
||||
| message | Message sent when a monitor is triggered | string | - | yes |
|
||||
| response_time_threshold_critical | Alerting threshold in seconds | string | `0.8` | no |
|
||||
| response_time_threshold_warning | Warning threshold in seconds | string | `0.4` | no |
|
||||
| response_time_silenced | Groups to mute for App Services response time monitor | map | `<map>` | no |
|
||||
| response_time_threshold_critical | Alerting threshold for response time in seconds | string | `10` | no |
|
||||
| response_time_threshold_warning | Warning threshold for response time in seconds | string | `5` | no |
|
||||
|
||||
Related documentation
|
||||
---------------------
|
||||
|
||||
@ -22,72 +22,82 @@ variable "delay" {
|
||||
default = 600
|
||||
}
|
||||
|
||||
###################################
|
||||
### RESPONSE TIME VARIABLES ###
|
||||
###################################
|
||||
variable "response_time_silenced" {
|
||||
description = "Groups to mute for App Services response time monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "response_time_threshold_critical" {
|
||||
default = 0.8
|
||||
description = "Alerting threshold in seconds"
|
||||
default = 10
|
||||
description = "Alerting threshold for response time in seconds"
|
||||
}
|
||||
|
||||
variable "response_time_threshold_warning" {
|
||||
default = 0.4
|
||||
description = "Warning threshold in seconds"
|
||||
default = 5
|
||||
description = "Warning threshold for response time in seconds"
|
||||
}
|
||||
|
||||
###################################
|
||||
### MEMORY USAGE VARIABLES ###
|
||||
###################################
|
||||
variable "memory_usage_silenced" {
|
||||
description = "Groups to mute for App Services memory usage monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "memory_usage_threshold_critical" {
|
||||
default = 52430000
|
||||
default = 1073741824 # 1Gb
|
||||
description = "Alerting threshold in Mib"
|
||||
}
|
||||
|
||||
variable "memory_usage_threshold_warning" {
|
||||
default = 33550000
|
||||
default = 536870912 # 512Mb
|
||||
description = "Warning threshold in MiB"
|
||||
}
|
||||
|
||||
#################################
|
||||
### HTTP 5xx status pages ###
|
||||
#################################
|
||||
|
||||
variable "http_5xx_requests_threshold_critical" {
|
||||
default = 20
|
||||
description = "Maximum critical acceptable percent of 5xx errors"
|
||||
variable "http_4xx_requests_silenced" {
|
||||
description = "Groups to mute for App Services 4xx requests monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "http_5xx_requests_threshold_warning" {
|
||||
default = 10
|
||||
description = "Maximum warning acceptable percent of 5xx errors"
|
||||
}
|
||||
|
||||
#################################
|
||||
### HTTP 4xx status pages ###
|
||||
#################################
|
||||
|
||||
variable "http_4xx_requests_threshold_critical" {
|
||||
default = 30
|
||||
default = 90
|
||||
description = "Maximum critical acceptable percent of 4xx errors"
|
||||
}
|
||||
|
||||
variable "http_4xx_requests_threshold_warning" {
|
||||
default = 15
|
||||
description = "Maximum warning acceptable percent of 4xx errors"
|
||||
default = 50
|
||||
description = "Warning regarding acceptable percent of 4xx errors"
|
||||
}
|
||||
|
||||
#################################
|
||||
### HTTP 2xx status pages ###
|
||||
#################################
|
||||
variable "http_5xx_requests_silenced" {
|
||||
description = "Groups to mute for App Services 5xx requests monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "http_5xx_requests_threshold_critical" {
|
||||
default = 90
|
||||
description = "Maximum critical acceptable percent of 5xx errors"
|
||||
}
|
||||
|
||||
variable "http_5xx_requests_threshold_warning" {
|
||||
default = 50
|
||||
description = "Warning regarding acceptable percent of 5xx errors"
|
||||
}
|
||||
|
||||
variable "http_successful_requests_silenced" {
|
||||
description = "Groups to mute for App Services successful requests monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "http_successful_requests_threshold_critical" {
|
||||
default = 90
|
||||
default = 10
|
||||
description = "Minimum critical acceptable percent of 2xx & 3xx requests"
|
||||
}
|
||||
|
||||
variable "http_successful_requests_threshold_warning" {
|
||||
default = 95
|
||||
description = "Minimum warning acceptable percent of 2xx & 3xx requests"
|
||||
default = 30
|
||||
description = "Warning regarding acceptable percent of 2xx & 3xx requests"
|
||||
}
|
||||
|
||||
@ -8,7 +8,7 @@ data "template_file" "filter" {
|
||||
|
||||
# Monitoring App Services response time
|
||||
resource "datadog_monitor" "appservices_response_time" {
|
||||
name = "[${var.environment}] App Services response time of {{value}}s is to high on {{name}}"
|
||||
name = "[${var.environment}] App Services response time too high {{comparator}} {{#is_alert}}{{threshold}}s{{/is_alert}}{{#is_warning}}{{warn_threshold}}s{{/is_warning}} ({{value}}s)"
|
||||
type = "metric alert"
|
||||
message = "${var.message}"
|
||||
|
||||
@ -26,9 +26,11 @@ resource "datadog_monitor" "appservices_response_time" {
|
||||
critical = "${var.response_time_threshold_critical}"
|
||||
}
|
||||
|
||||
notify_no_data = true # Will notify when no data is received
|
||||
silenced = "${var.response_time_silenced}"
|
||||
|
||||
notify_no_data = true # Will notify when no data is received
|
||||
renotify_interval = 0
|
||||
require_full_window = true
|
||||
require_full_window = false
|
||||
timeout_h = 0
|
||||
include_tags = true
|
||||
|
||||
@ -37,7 +39,7 @@ resource "datadog_monitor" "appservices_response_time" {
|
||||
|
||||
# Monitoring App Services memory usage
|
||||
resource "datadog_monitor" "appservices_memory_usage_count" {
|
||||
name = "[${var.environment}] App Services memory usage > ${ceil(var.memory_usage_threshold_critical/1000000)}MiB on {{name}}"
|
||||
name = "[${var.environment}] App Services memory usage {{comparator}} {{#is_alert}}{{threshold}}{{/is_alert}}{{#is_warning}}{{warn_threshold}}{{/is_warning}} ({{value}})"
|
||||
type = "metric alert"
|
||||
message = "${var.message}"
|
||||
|
||||
@ -55,9 +57,11 @@ resource "datadog_monitor" "appservices_memory_usage_count" {
|
||||
critical = "${var.memory_usage_threshold_critical}"
|
||||
}
|
||||
|
||||
notify_no_data = true # Will notify when no data is received
|
||||
silenced = "${var.memory_usage_silenced}"
|
||||
|
||||
notify_no_data = true # Will notify when no data is received
|
||||
renotify_interval = 0
|
||||
require_full_window = true
|
||||
require_full_window = false
|
||||
timeout_h = 0
|
||||
include_tags = true
|
||||
|
||||
@ -66,7 +70,7 @@ resource "datadog_monitor" "appservices_memory_usage_count" {
|
||||
|
||||
# Monitoring App Services 5xx errors percent
|
||||
resource "datadog_monitor" "appservices_http_5xx_errors_count" {
|
||||
name = "[${var.environment}] App Services HTTP 5xx errors is {{value}}% above the limit on {{name}}"
|
||||
name = "[${var.environment}] App Services HTTP 5xx errors too high {{comparator}} {{#is_alert}}{{threshold}}%{{/is_alert}}{{#is_warning}}{{warn_threshold}}%{{/is_warning}} ({{value}}%)"
|
||||
type = "metric alert"
|
||||
message = "${var.message}"
|
||||
|
||||
@ -85,9 +89,11 @@ resource "datadog_monitor" "appservices_http_5xx_errors_count" {
|
||||
critical = "${var.http_5xx_requests_threshold_critical}"
|
||||
}
|
||||
|
||||
silenced = "${var.http_5xx_requests_silenced}"
|
||||
|
||||
notify_no_data = false # Will NOT notify when no data is received
|
||||
renotify_interval = 0
|
||||
require_full_window = true
|
||||
require_full_window = false
|
||||
timeout_h = 1
|
||||
include_tags = true
|
||||
|
||||
@ -96,7 +102,7 @@ resource "datadog_monitor" "appservices_http_5xx_errors_count" {
|
||||
|
||||
# Monitoring App Services 4xx errors percent
|
||||
resource "datadog_monitor" "appservices_http_4xx_errors_count" {
|
||||
name = "[${var.environment}] App Services HTTP 4xx errors is {{value}}% above the limit on {{name}}"
|
||||
name = "[${var.environment}] App Services HTTP 4xx errors too high {{comparator}} {{#is_alert}}{{threshold}}%{{/is_alert}}{{#is_warning}}{{warn_threshold}}%{{/is_warning}} ({{value}}%)"
|
||||
type = "metric alert"
|
||||
message = "${var.message}"
|
||||
|
||||
@ -115,9 +121,11 @@ resource "datadog_monitor" "appservices_http_4xx_errors_count" {
|
||||
critical = "${var.http_4xx_requests_threshold_critical}"
|
||||
}
|
||||
|
||||
silenced = "${var.http_4xx_requests_silenced}"
|
||||
|
||||
notify_no_data = false # Will NOT notify when no data is received
|
||||
renotify_interval = 0
|
||||
require_full_window = true
|
||||
require_full_window = false
|
||||
timeout_h = 1
|
||||
include_tags = true
|
||||
|
||||
@ -126,7 +134,7 @@ resource "datadog_monitor" "appservices_http_4xx_errors_count" {
|
||||
|
||||
# Monitoring App Services HTTP 2xx & 3xx status pages percent
|
||||
resource "datadog_monitor" "appservices_http_success_status_rate" {
|
||||
name = "[${var.environment}] App Services HTTP successful responses is {{value}}% below the limit on {{name}}"
|
||||
name = "[${var.environment}] App Services HTTP successful responses too low {{comparator}} {{#is_alert}}{{threshold}}%{{/is_alert}}{{#is_warning}}{{warn_threshold}}%{{/is_warning}} ({{value}}%)"
|
||||
type = "metric alert"
|
||||
message = "${var.message}"
|
||||
|
||||
@ -146,9 +154,11 @@ resource "datadog_monitor" "appservices_http_success_status_rate" {
|
||||
critical = "${var.http_successful_requests_threshold_critical}"
|
||||
}
|
||||
|
||||
silenced = "${var.http_successful_requests_silenced}"
|
||||
|
||||
notify_no_data = false # Will notify when no data is received
|
||||
renotify_interval = 0
|
||||
require_full_window = true
|
||||
require_full_window = false
|
||||
timeout_h = 1
|
||||
include_tags = true
|
||||
|
||||
|
||||
@ -29,22 +29,16 @@ Inputs
|
||||
|------|-------------|:----:|:-----:|:-----:|
|
||||
| delay | Delay in seconds for the metric evaluation | string | `600` | no |
|
||||
| environment | Architecture environment | string | - | yes |
|
||||
| errors_rate_thresold_critical | Errors ratio (percentage) to trigger the critical alert | string | `3` | no |
|
||||
| errors_rate_thresold_warning | Errors ratio (percentage) to trigger a warning alert | string | `1` | no |
|
||||
| failed_requests_rate_thresold_critical | Failed requests ratio (percentage) to trigger the critical alert | string | `3` | no |
|
||||
| failed_requests_rate_thresold_warning | Failed requests ratio (percentage) to trigger a warning alert | string | `1` | no |
|
||||
| errors_rate_silenced | Groups to mute for Event Hub errors monitor | map | `<map>` | no |
|
||||
| errors_rate_thresold_critical | Errors ratio (percentage) to trigger the critical alert | string | `90` | no |
|
||||
| errors_rate_thresold_warning | Errors ratio (percentage) to trigger a warning alert | string | `50` | no |
|
||||
| failed_requests_rate_silenced | Groups to mute for Event Hub failed requests monitor | map | `<map>` | no |
|
||||
| failed_requests_rate_thresold_critical | Failed requests ratio (percentage) to trigger the critical alert | string | `90` | no |
|
||||
| failed_requests_rate_thresold_warning | Failed requests ratio (percentage) to trigger a warning alert | string | `50` | no |
|
||||
| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no |
|
||||
| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no |
|
||||
| message | Message sent when an alert is triggered | string | - | yes |
|
||||
|
||||
Outputs
|
||||
-------
|
||||
|
||||
| Name | Description |
|
||||
|------|-------------|
|
||||
| errors_monitor_id | Id of the `errors` monitor |
|
||||
| failed_requests_monitor_id | Id of the `failed requests` monitor |
|
||||
| status_monitor_id | Id of the `status` monitor |
|
||||
| status_silenced | Groups to mute for Event Hub status monitor | map | `<map>` | no |
|
||||
|
||||
Related documentation
|
||||
---------------------
|
||||
|
||||
@ -24,22 +24,40 @@ variable "filter_tags_custom" {
|
||||
default = "*"
|
||||
}
|
||||
|
||||
variable "status_silenced" {
|
||||
description = "Groups to mute for Event Hub status monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "failed_requests_rate_silenced" {
|
||||
description = "Groups to mute for Event Hub failed requests monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "failed_requests_rate_thresold_critical" {
|
||||
description = "Failed requests ratio (percentage) to trigger the critical alert"
|
||||
default = 3
|
||||
default = 90
|
||||
}
|
||||
|
||||
variable "failed_requests_rate_thresold_warning" {
|
||||
description = "Failed requests ratio (percentage) to trigger a warning alert"
|
||||
default = 1
|
||||
default = 50
|
||||
}
|
||||
|
||||
variable "errors_rate_silenced" {
|
||||
description = "Groups to mute for Event Hub errors monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "errors_rate_thresold_critical" {
|
||||
description = "Errors ratio (percentage) to trigger the critical alert"
|
||||
default = 3
|
||||
default = 90
|
||||
}
|
||||
|
||||
variable "errors_rate_thresold_warning" {
|
||||
description = "Errors ratio (percentage) to trigger a warning alert"
|
||||
default = 1
|
||||
default = 50
|
||||
}
|
||||
|
||||
@ -7,7 +7,7 @@ data "template_file" "filter" {
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "eventhub_status" {
|
||||
name = "[${var.environment}] Event Hub status is not ok on {{name}}"
|
||||
name = "[${var.environment}] Event Hub is down"
|
||||
message = "${var.message}"
|
||||
|
||||
query = <<EOF
|
||||
@ -16,6 +16,8 @@ resource "datadog_monitor" "eventhub_status" {
|
||||
|
||||
type = "metric alert"
|
||||
|
||||
silenced = "${var.status_silenced}"
|
||||
|
||||
notify_no_data = true
|
||||
evaluation_delay = "${var.delay}"
|
||||
renotify_interval = 0
|
||||
@ -23,7 +25,7 @@ resource "datadog_monitor" "eventhub_status" {
|
||||
timeout_h = 0
|
||||
include_tags = true
|
||||
locked = false
|
||||
require_full_window = true
|
||||
require_full_window = false
|
||||
new_host_delay = "${var.delay}"
|
||||
no_data_timeframe = 20
|
||||
|
||||
@ -31,7 +33,7 @@ resource "datadog_monitor" "eventhub_status" {
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "eventhub_failed_requests" {
|
||||
name = "[${var.environment}] Event Hub too much failed requests on {{name}}"
|
||||
name = "[${var.environment}] Event Hub too many failed requests {{comparator}} {{#is_alert}}{{threshold}}%{{/is_alert}}{{#is_warning}}{{warn_threshold}}%{{/is_warning}} ({{value}}%)"
|
||||
message = "${var.message}"
|
||||
|
||||
query = <<EOF
|
||||
@ -50,6 +52,8 @@ resource "datadog_monitor" "eventhub_failed_requests" {
|
||||
warning = "${var.failed_requests_rate_thresold_warning}"
|
||||
}
|
||||
|
||||
silenced = "${var.failed_requests_rate_silenced}"
|
||||
|
||||
notify_no_data = false
|
||||
evaluation_delay = "${var.delay}"
|
||||
renotify_interval = 0
|
||||
@ -57,7 +61,7 @@ resource "datadog_monitor" "eventhub_failed_requests" {
|
||||
timeout_h = 1
|
||||
include_tags = true
|
||||
locked = false
|
||||
require_full_window = true
|
||||
require_full_window = false
|
||||
new_host_delay = "${var.delay}"
|
||||
no_data_timeframe = 20
|
||||
|
||||
@ -65,7 +69,7 @@ resource "datadog_monitor" "eventhub_failed_requests" {
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "eventhub_errors" {
|
||||
name = "[${var.environment}] Event Hub too much errors on {{name}}"
|
||||
name = "[${var.environment}] Event Hub too manny errors {{comparator}} {{#is_alert}}{{threshold}}%{{/is_alert}}{{#is_warning}}{{warn_threshold}}%{{/is_warning}} ({{value}}%)"
|
||||
message = "${var.message}"
|
||||
|
||||
query = <<EOF
|
||||
@ -88,6 +92,8 @@ resource "datadog_monitor" "eventhub_errors" {
|
||||
warning = "${var.errors_rate_thresold_warning}"
|
||||
}
|
||||
|
||||
silenced = "${var.errors_rate_silenced}"
|
||||
|
||||
notify_no_data = false
|
||||
evaluation_delay = "${var.delay}"
|
||||
renotify_interval = 0
|
||||
@ -95,7 +101,7 @@ resource "datadog_monitor" "eventhub_errors" {
|
||||
timeout_h = 1
|
||||
include_tags = true
|
||||
locked = false
|
||||
require_full_window = true
|
||||
require_full_window = false
|
||||
new_host_delay = "${var.delay}"
|
||||
no_data_timeframe = 20
|
||||
|
||||
|
||||
@ -1,11 +0,0 @@
|
||||
output "status_monitor_id" {
|
||||
value = "${datadog_monitor.eventhub_failed_requests.id}"
|
||||
}
|
||||
|
||||
output "failed_requests_monitor_id" {
|
||||
value = "${datadog_monitor.eventhub_status.id}"
|
||||
}
|
||||
|
||||
output "errors_monitor_id" {
|
||||
value = "${datadog_monitor.eventhub_errors.id}"
|
||||
}
|
||||
@ -29,177 +29,347 @@ variable "non_taggable_filter_tags" {
|
||||
}
|
||||
|
||||
# Azure API Management specific variables
|
||||
variable "apimanagement_status_silenced" {
|
||||
description = "Groups to mute for API Management status monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "apimanagement_failed_requests_silenced" {
|
||||
description = "Groups to mute for API Management failed requests monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "apimanagement_failed_requests_threshold_critical" {
|
||||
description = "Maximum acceptable percent of failed requests"
|
||||
default = 5
|
||||
default = 90
|
||||
}
|
||||
|
||||
variable "apimanagement_failed_requests_threshold_warning" {
|
||||
description = "Warning regarding acceptable percent of failed requests"
|
||||
default = 50
|
||||
}
|
||||
|
||||
variable "apimanagement_other_requests_silenced" {
|
||||
description = "Groups to mute for API Management other requests monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "apimanagement_other_requests_threshold_critical" {
|
||||
description = "Maximum acceptable percent of other requests"
|
||||
default = 5
|
||||
default = 90
|
||||
}
|
||||
|
||||
variable "apimanagement_other_requests_threshold_warning" {
|
||||
description = "Warning regarding acceptable percent of other requests"
|
||||
default = 50
|
||||
}
|
||||
|
||||
variable "apimanagement_unauthorized_requests_silenced" {
|
||||
description = "Groups to mute for API Management unauthorized requests monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "apimanagement_unauthorized_requests_threshold_critical" {
|
||||
description = "Maximum acceptable percent of unauthorized requests"
|
||||
default = 5
|
||||
default = 90
|
||||
}
|
||||
|
||||
variable "apimanagement_unauthorized_requests_threshold_warning" {
|
||||
description = "Warning regarding acceptable percent of unauthorized requests"
|
||||
default = 50
|
||||
}
|
||||
|
||||
variable "apimanagement_successful_requests_silenced" {
|
||||
description = "Groups to mute for API Management successful requests monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "apimanagement_successful_requests_threshold_critical" {
|
||||
description = "Minimum acceptable percent of successful requests"
|
||||
default = 90
|
||||
default = 10
|
||||
}
|
||||
|
||||
variable "apimanagement_successful_requests_threshold_warning" {
|
||||
description = "Warning regarding acceptable percent of successful requests"
|
||||
default = 30
|
||||
}
|
||||
|
||||
# Azure App Services specific variables
|
||||
variable "appservices_response_time_silenced" {
|
||||
description = "Groups to mute for App Services response time monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "appservices_response_time_threshold_critical" {
|
||||
default = 0.8
|
||||
description = "Alerting threshold in seconds"
|
||||
default = 10
|
||||
description = "Alerting threshold for response time in seconds"
|
||||
}
|
||||
|
||||
variable "appservices_response_time_threshold_warning" {
|
||||
default = 0.4
|
||||
description = "Warning threshold in seconds"
|
||||
default = 5
|
||||
description = "Warning threshold for response time in seconds"
|
||||
}
|
||||
|
||||
variable "appservices_memory_usage_silenced" {
|
||||
description = "Groups to mute for App Services memory usage monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "appservices_memory_usage_threshold_critical" {
|
||||
default = 52430000
|
||||
default = 1073741824 # 1Gb
|
||||
description = "Alerting threshold in Mib"
|
||||
}
|
||||
|
||||
variable "appservices_memory_usage_threshold_warning" {
|
||||
default = 33550000
|
||||
default = 536870912 # 512Mb
|
||||
description = "Warning threshold in MiB"
|
||||
}
|
||||
|
||||
variable "appservices_http_4xx_requests_silenced" {
|
||||
description = "Groups to mute for App Services 4xx requests monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "appservices_http_4xx_requests_threshold_critical" {
|
||||
default = 30
|
||||
default = 90
|
||||
description = "Maximum critical acceptable percent of 4xx errors"
|
||||
}
|
||||
|
||||
variable "appservices_http_4xx_requests_threshold_warning" {
|
||||
default = 15
|
||||
description = "Maximum warning acceptable percent of 4xx errors"
|
||||
default = 50
|
||||
description = "Warning regarding acceptable percent of 4xx errors"
|
||||
}
|
||||
|
||||
variable "appservices_http_5xx_requests_silenced" {
|
||||
description = "Groups to mute for App Services 5xx requests monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "appservices_http_5xx_requests_threshold_critical" {
|
||||
default = 20
|
||||
default = 90
|
||||
description = "Maximum critical acceptable percent of 5xx errors"
|
||||
}
|
||||
|
||||
variable "appservices_http_5xx_requests_threshold_warning" {
|
||||
default = 10
|
||||
description = "Maximum warning acceptable percent of 5xx errors"
|
||||
default = 50
|
||||
description = "Warning regarding acceptable percent of 5xx errors"
|
||||
}
|
||||
|
||||
variable "appservices_http_successful_requests_silenced" {
|
||||
description = "Groups to mute for App Services successful requests monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "appservices_http_successful_requests_threshold_critical" {
|
||||
default = 90
|
||||
default = 10
|
||||
description = "Minimum critical acceptable percent of 2xx & 3xx requests"
|
||||
}
|
||||
|
||||
variable "appservices_http_successful_requests_threshold_warning" {
|
||||
default = 95
|
||||
description = "Minimum warning acceptable percent of 2xx & 3xx requests"
|
||||
default = 30
|
||||
description = "Warning regarding acceptable percent of 2xx & 3xx requests"
|
||||
}
|
||||
|
||||
# Azure Event Hub specific variables
|
||||
variable "eventhub_status_silenced" {
|
||||
description = "Groups to mute for Event Hub status monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "eventhub_failed_requests_rate_silenced" {
|
||||
description = "Groups to mute for Event Hub failed requests monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "eventhub_failed_requests_rate_thresold_critical" {
|
||||
description = "Failed requests ratio (percentage) to trigger the critical alert"
|
||||
default = 3
|
||||
default = 90
|
||||
}
|
||||
|
||||
variable "eventhub_failed_requests_rate_thresold_warning" {
|
||||
description = "Failed requests ratio (percentage) to trigger a warning alert"
|
||||
default = 1
|
||||
default = 50
|
||||
}
|
||||
|
||||
variable "eventhub_errors_rate_silenced" {
|
||||
description = "Groups to mute for Event Hub errors monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "eventhub_errors_rate_thresold_critical" {
|
||||
description = "Errors ratio (percentage) to trigger the critical alert"
|
||||
default = 3
|
||||
default = 90
|
||||
}
|
||||
|
||||
variable "eventhub_errors_rate_thresold_warning" {
|
||||
description = "Errors ratio (percentage) to trigger a warning alert"
|
||||
default = 1
|
||||
default = 50
|
||||
}
|
||||
|
||||
# IOT Hub specific variables
|
||||
variable "iothub_status_silenced" {
|
||||
description = "Groups to mute for IoT Hub status monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "iothub_total_devices_silenced" {
|
||||
description = "Groups to mute for IoT Hub total device monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "iothub_too_many_d2c_telemetry_ingress_nosent_silenced" {
|
||||
description = "Groups to mute for IoT Hub unsent d2c telemetry monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "iothub_failed_jobs_rate_silenced" {
|
||||
description = "Groups to mute for IoT Hub failed jobs monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "iothub_failed_jobs_rate_threshold_warning" {
|
||||
description = "Jobs Failed rate limit (warning threshold)"
|
||||
default = 0
|
||||
default = 50
|
||||
}
|
||||
|
||||
variable "iothub_failed_jobs_rate_threshold_critical" {
|
||||
description = "Jobs Failed rate limit (critical threshold)"
|
||||
default = 10
|
||||
default = 90
|
||||
}
|
||||
|
||||
variable "iothub_failed_listjobs_rate_silenced" {
|
||||
description = "Groups to mute for IoT Hub failed list jobs monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "iothub_failed_listjobs_rate_threshold_warning" {
|
||||
description = "ListJobs Failed rate limit (warning threshold)"
|
||||
default = 0
|
||||
default = 50
|
||||
}
|
||||
|
||||
variable "iothub_failed_listjobs_rate_threshold_critical" {
|
||||
description = "ListJobs Failed rate limit (critical threshold)"
|
||||
default = 10
|
||||
default = 90
|
||||
}
|
||||
|
||||
variable "iothub_failed_queryjobs_rate_silenced" {
|
||||
description = "Groups to mute for IoT Hub failed query jobs monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "iothub_failed_queryjobs_rate_threshold_warning" {
|
||||
description = "QueryJobs Failed rate limit (warning threshold)"
|
||||
default = 0
|
||||
default = 50
|
||||
}
|
||||
|
||||
variable "iothub_failed_queryjobs_rate_threshold_critical" {
|
||||
description = "QueryJobs Failed rate limit (critical threshold)"
|
||||
default = 10
|
||||
default = 90
|
||||
}
|
||||
|
||||
variable "iothub_failed_c2d_methods_rate_silenced" {
|
||||
description = "Groups to mute for IoT Hub failed c2d methods monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "iothub_failed_c2d_methods_rate_threshold_warning" {
|
||||
description = "C2D Methods Failed rate limit (warning threshold)"
|
||||
default = 0
|
||||
default = 50
|
||||
}
|
||||
|
||||
variable "iothub_failed_c2d_methods_rate_threshold_critical" {
|
||||
description = "C2D Methods Failed rate limit (critical threshold)"
|
||||
default = 10
|
||||
default = 90
|
||||
}
|
||||
|
||||
variable "iothub_failed_c2d_twin_read_rate_silenced" {
|
||||
description = "Groups to mute for IoT Hub failed c2d twin read monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "iothub_failed_c2d_twin_read_rate_threshold_warning" {
|
||||
description = "C2D Twin Read Failed rate limit (warning threshold)"
|
||||
default = 0
|
||||
default = 50
|
||||
}
|
||||
|
||||
variable "iothub_failed_c2d_twin_read_rate_threshold_critical" {
|
||||
description = "C2D Twin Read Failed rate limit (critical threshold)"
|
||||
default = 10
|
||||
default = 90
|
||||
}
|
||||
|
||||
variable "iothub_failed_c2d_twin_update_rate_silenced" {
|
||||
description = "Groups to mute for IoT Hub failed c2d twin update monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "iothub_failed_c2d_twin_update_rate_threshold_warning" {
|
||||
description = "C2D Twin Update Failed rate limit (warning threshold)"
|
||||
default = 0
|
||||
default = 50
|
||||
}
|
||||
|
||||
variable "iothub_failed_c2d_twin_update_rate_threshold_critical" {
|
||||
description = "C2D Twin Update Failed rate limit (critical threshold)"
|
||||
default = 10
|
||||
default = 90
|
||||
}
|
||||
|
||||
variable "iothub_failed_d2c_twin_read_rate_silenced" {
|
||||
description = "Groups to mute for IoT Hub failed d2c twin read monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "iothub_failed_d2c_twin_read_rate_threshold_warning" {
|
||||
description = "D2C Twin Read Failed rate limit (warning threshold)"
|
||||
default = 0
|
||||
default = 50
|
||||
}
|
||||
|
||||
variable "iothub_failed_d2c_twin_read_rate_threshold_critical" {
|
||||
description = "D2C Twin Read Failed rate limit (critical threshold)"
|
||||
default = 10
|
||||
default = 90
|
||||
}
|
||||
|
||||
variable "iothub_failed_d2c_twin_update_rate_silenced" {
|
||||
description = "Groups to mute for IoT Hub failed d2c twin update monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "iothub_failed_d2c_twin_update_rate_threshold_warning" {
|
||||
description = "D2C Twin Update Failed rate limit (warning threshold)"
|
||||
default = 0
|
||||
default = 50
|
||||
}
|
||||
|
||||
variable "iothub_failed_d2c_twin_update_rate_threshold_critical" {
|
||||
description = "D2C Twin Update Failed rate limit (critical threshold)"
|
||||
default = 10
|
||||
default = 90
|
||||
}
|
||||
|
||||
variable "iothub_dropped_d2c_telemetry_egress_silenced" {
|
||||
description = "Groups to mute for IoT Hub dropped d2c telemetry monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "iothub_dropped_d2c_telemetry_egress_threshold_warning" {
|
||||
@ -212,6 +382,12 @@ variable "iothub_dropped_d2c_telemetry_egress_threshold_critical" {
|
||||
default = 1000
|
||||
}
|
||||
|
||||
variable "iothub_orphaned_d2c_telemetry_egress_silenced" {
|
||||
description = "Groups to mute for IoT Hub orphaned d2c telemetry monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "iothub_orphaned_d2c_telemetry_egress_threshold_warning" {
|
||||
description = "D2C Telemetry Orphaned limit (warning threshold)"
|
||||
default = 500
|
||||
@ -222,6 +398,12 @@ variable "iothub_orphaned_d2c_telemetry_egress_threshold_critical" {
|
||||
default = 1000
|
||||
}
|
||||
|
||||
variable "iothub_invalid_d2c_telemetry_egress_silenced" {
|
||||
description = "Groups to mute for IoT Hub invalid d2c telemetry monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "iothub_invalid_d2c_telemetry_egress_threshold_warning" {
|
||||
description = "D2C Telemetry Invalid limit (warning threshold)"
|
||||
default = 500
|
||||
@ -232,6 +414,12 @@ variable "iothub_invalid_d2c_telemetry_egress_threshold_critical" {
|
||||
default = 1000
|
||||
}
|
||||
|
||||
variable "iothub_fallback_d2c_telemetry_egress_silenced" {
|
||||
description = "Groups to mute for IoT Hub fallback d2c telemetry monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "iothub_fallback_d2c_telemetry_egress_threshold_warning" {
|
||||
description = "D2C Telemetry Fallback limit (warning threshold)"
|
||||
default = 500
|
||||
@ -243,6 +431,18 @@ variable "iothub_fallback_d2c_telemetry_egress_threshold_critical" {
|
||||
}
|
||||
|
||||
# Azure Redis specific variables
|
||||
variable "redis_status_silenced" {
|
||||
description = "Groups to mute for Redis status monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "redis_evictedkeys_limit_silenced" {
|
||||
description = "Groups to mute for Redis evicted keys monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "redis_evictedkeys_limit_threshold_warning" {
|
||||
description = "Evicted keys limit (warning threshold)"
|
||||
default = 0
|
||||
@ -253,6 +453,12 @@ variable "redis_evictedkeys_limit_threshold_critical" {
|
||||
default = 100
|
||||
}
|
||||
|
||||
variable "redis_percent_processor_time_silenced" {
|
||||
description = "Groups to mute for Redis processor monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "redis_percent_processor_time_threshold_critical" {
|
||||
description = "Processor time percent (critical threshold)"
|
||||
default = 80
|
||||
@ -263,6 +469,12 @@ variable "redis_percent_processor_time_threshold_warning" {
|
||||
default = 60
|
||||
}
|
||||
|
||||
variable "redis_server_load_rate_silenced" {
|
||||
description = "Groups to mute for Redis server load monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "redis_server_load_rate_threshold_critical" {
|
||||
description = "Server CPU load rate (critical threshold)"
|
||||
default = 90
|
||||
@ -274,6 +486,12 @@ variable "redis_server_load_rate_threshold_warning" {
|
||||
}
|
||||
|
||||
# Azure SQL Database specific variables
|
||||
variable "sqldatabase_cpu_silenced" {
|
||||
description = "Groups to mute for SQL CPU monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "sqldatabase_cpu_threshold_warning" {
|
||||
description = "CPU usage in percent (warning threshold)"
|
||||
default = "80"
|
||||
@ -284,6 +502,12 @@ variable "sqldatabase_cpu_threshold_critical" {
|
||||
default = "90"
|
||||
}
|
||||
|
||||
variable "sqldatabase_diskspace_silenced" {
|
||||
description = "Groups to mute for SQL disk space monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "sqldatabase_diskspace_threshold_warning" {
|
||||
description = "Disk space used in percent (warning threshold)"
|
||||
default = "80"
|
||||
@ -294,6 +518,12 @@ variable "sqldatabase_diskspace_threshold_critical" {
|
||||
default = "90"
|
||||
}
|
||||
|
||||
variable "sqldatabase_dtu_silenced" {
|
||||
description = "Groups to mute for SQL DTU monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "sqldatabase_dtu_threshold_warning" {
|
||||
description = "Amount of DTU used (warning threshold)"
|
||||
default = "85"
|
||||
@ -304,58 +534,175 @@ variable "sqldatabase_dtu_threshold_critical" {
|
||||
default = "90"
|
||||
}
|
||||
|
||||
variable "sqldatabase_deadlock_silenced" {
|
||||
description = "Groups to mute for SQL Deadlock monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "sqldatabase_deadlock_threshold_critical" {
|
||||
description = "Amount of Deadlocks (critical threshold)"
|
||||
default = "1"
|
||||
}
|
||||
|
||||
# Azure Storage specific variables
|
||||
variable "storage_availability_silenced" {
|
||||
description = "Groups to mute for Storage availability monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "storage_availability_threshold_critical" {
|
||||
description = "Minimum acceptable percent of availability for a storage"
|
||||
default = 50
|
||||
}
|
||||
|
||||
variable "storage_availability_threshold_warning" {
|
||||
description = "Warning regarding acceptable percent of availability for a storage"
|
||||
default = 90
|
||||
}
|
||||
|
||||
variable "storage_successful_requests_silenced" {
|
||||
description = "Groups to mute for Storage sucessful requests monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "storage_successful_requests_threshold_critical" {
|
||||
description = "Minimum acceptable percent of successful requests for a storage"
|
||||
default = 90
|
||||
default = 10
|
||||
}
|
||||
|
||||
variable "storage_successful_requests_threshold_warning" {
|
||||
description = "Warning regarding acceptable percent of successful requests for a storage"
|
||||
default = 30
|
||||
}
|
||||
|
||||
variable "storage_latency_silenced" {
|
||||
description = "Groups to mute for Storage latency monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "storage_latency_threshold_critical" {
|
||||
description = "Maximum acceptable end to end latency (ms) for a storage"
|
||||
default = 2000
|
||||
}
|
||||
|
||||
variable "storage_latency_threshold_warning" {
|
||||
description = "Warning regarding acceptable end to end latency (ms) for a storage"
|
||||
default = 1000
|
||||
}
|
||||
|
||||
variable "storage_timeout_error_requests_silenced" {
|
||||
description = "Groups to mute for Storage timeout monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "storage_timeout_error_requests_threshold_critical" {
|
||||
description = "Maximum acceptable percent of timeout error requests for a storage"
|
||||
default = 5
|
||||
default = 90
|
||||
}
|
||||
|
||||
variable "storage_timeout_error_requests_threshold_warning" {
|
||||
description = "Warning regarding acceptable percent of timeout error requests for a storage"
|
||||
default = 50
|
||||
}
|
||||
|
||||
variable "storage_network_error_requests_silenced" {
|
||||
description = "Groups to mute for Storage network errors monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "storage_network_error_requests_threshold_critical" {
|
||||
description = "Maximum acceptable percent of network error requests for a storage"
|
||||
default = 5
|
||||
default = 90
|
||||
}
|
||||
|
||||
variable "storage_network_error_requests_threshold_warning" {
|
||||
description = "Warning regarding acceptable percent of network error requests for a storage"
|
||||
default = 50
|
||||
}
|
||||
|
||||
variable "storage_throttling_error_requests_silenced" {
|
||||
description = "Groups to mute for Storage throttling error monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "storage_throttling_error_requests_threshold_critical" {
|
||||
description = "Maximum acceptable percent of throttling error requests for a storage"
|
||||
default = 10
|
||||
default = 90
|
||||
}
|
||||
|
||||
variable "storage_throttling_error_requests_threshold_warning" {
|
||||
description = "Warning regarding acceptable percent of throttling error requests for a storage"
|
||||
default = 50
|
||||
}
|
||||
|
||||
variable "storage_server_other_error_requests_silenced" {
|
||||
description = "Groups to mute for Storage server other errors monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "storage_server_other_error_requests_threshold_critical" {
|
||||
description = "Maximum acceptable percent of server other error requests for a storage"
|
||||
default = 10
|
||||
default = 90
|
||||
}
|
||||
|
||||
variable "storage_server_other_error_requests_threshold_warning" {
|
||||
description = "Warning regarding acceptable percent of server other error requests for a storage"
|
||||
default = 50
|
||||
}
|
||||
|
||||
variable "storage_client_other_error_requests_silenced" {
|
||||
description = "Groups to mute for Storage other errors monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "storage_client_other_error_requests_threshold_critical" {
|
||||
description = "Maximum acceptable percent of client other error requests for a storage"
|
||||
default = 15
|
||||
default = 90
|
||||
}
|
||||
|
||||
variable "storage_client_other_error_requests_threshold_warning" {
|
||||
description = "Warning regarding acceptable percent of client other error requests for a storage"
|
||||
default = 50
|
||||
}
|
||||
|
||||
variable "storage_authorization_error_requests_silenced" {
|
||||
description = "Groups to mute for Storage authorization errors monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "storage_authorization_error_requests_threshold_critical" {
|
||||
description = "Maximum acceptable percent of authorization error requests for a storage"
|
||||
default = 15
|
||||
default = 90
|
||||
}
|
||||
|
||||
variable "storage_authorization_error_requests_threshold_warning" {
|
||||
description = "Warning regarding acceptable percent of authorization error requests for a storage"
|
||||
default = 50
|
||||
}
|
||||
|
||||
# Azure Stream Analytics specific variables
|
||||
variable "streamanalytics_status_silenced" {
|
||||
description = "Groups to mute for Stream Analytics status monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "streamanalytics_su_utilization_silenced" {
|
||||
description = "Groups to mute for Stream Analytics utilization monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "streamanalytics_su_utilization_threshold_warning" {
|
||||
description = "Streaming Unit utilization rate limit (warning threshold)"
|
||||
default = 60
|
||||
@ -366,7 +713,13 @@ variable "streamanalytics_su_utilization_threshold_critical" {
|
||||
default = 80
|
||||
}
|
||||
|
||||
variable "streamanalytics_function_requests_threshold_warning" {
|
||||
variable "streamanalytics_failed_function_requests_silenced" {
|
||||
description = "Groups to mute for Stream Analytics failed requests monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "streamanalytics_failed_function_requests_threshold_warning" {
|
||||
description = "Failed Function Request rate limit (warning threshold)"
|
||||
default = 0
|
||||
}
|
||||
@ -376,6 +729,12 @@ variable "streamanalytics_failed_function_requests_threshold_critical" {
|
||||
default = 10
|
||||
}
|
||||
|
||||
variable "streamanalytics_conversion_errors_silenced" {
|
||||
description = "Groups to mute for Stream Analytics conversion errors monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "streamanalytics_conversion_errors_threshold_warning" {
|
||||
description = "Conversion errors limit (warning threshold)"
|
||||
default = 0
|
||||
@ -386,6 +745,12 @@ variable "streamanalytics_conversion_errors_threshold_critical" {
|
||||
default = 10
|
||||
}
|
||||
|
||||
variable "streamanalytics_runtime_errors_silenced" {
|
||||
description = "Groups to mute for Stream Analytics runtime errors monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "streamanalytics_runtime_errors_threshold_warning" {
|
||||
description = "Runtime errors limit (warning threshold)"
|
||||
default = 0
|
||||
|
||||
@ -39,33 +39,48 @@ Inputs
|
||||
| Name | Description | Type | Default | Required |
|
||||
|------|-------------|:----:|:-----:|:-----:|
|
||||
| delay | Delay in seconds for the metric evaluation | string | `600` | no |
|
||||
| dropped_d2c_telemetry_egress_silenced | Groups to mute for IoT Hub dropped d2c telemetry monitor | map | `<map>` | no |
|
||||
| dropped_d2c_telemetry_egress_threshold_critical | D2C Telemetry Dropped limit (critical threshold) | string | `1000` | no |
|
||||
| dropped_d2c_telemetry_egress_threshold_warning | D2C Telemetry Dropped limit (warning threshold) | string | `500` | no |
|
||||
| environment | Architecture Environment | string | - | yes |
|
||||
| failed_c2d_methods_rate_threshold_critical | C2D Methods Failed rate limit (critical threshold) | string | `10` | no |
|
||||
| failed_c2d_methods_rate_threshold_warning | C2D Methods Failed rate limit (warning threshold) | string | `0` | no |
|
||||
| failed_c2d_twin_read_rate_threshold_critical | C2D Twin Read Failed rate limit (critical threshold) | string | `10` | no |
|
||||
| failed_c2d_twin_read_rate_threshold_warning | C2D Twin Read Failed rate limit (warning threshold) | string | `0` | no |
|
||||
| failed_c2d_twin_update_rate_threshold_critical | C2D Twin Update Failed rate limit (critical threshold) | string | `10` | no |
|
||||
| failed_c2d_twin_update_rate_threshold_warning | C2D Twin Update Failed rate limit (warning threshold) | string | `0` | no |
|
||||
| failed_d2c_twin_read_rate_threshold_critical | D2C Twin Read Failed rate limit (critical threshold) | string | `10` | no |
|
||||
| failed_d2c_twin_read_rate_threshold_warning | D2C Twin Read Failed rate limit (warning threshold) | string | `0` | no |
|
||||
| failed_d2c_twin_update_rate_threshold_critical | D2C Twin Update Failed rate limit (critical threshold) | string | `10` | no |
|
||||
| failed_d2c_twin_update_rate_threshold_warning | D2C Twin Update Failed rate limit (warning threshold) | string | `0` | no |
|
||||
| failed_jobs_rate_threshold_critical | Jobs Failed rate limit (critical threshold) | string | `10` | no |
|
||||
| failed_jobs_rate_threshold_warning | Jobs Failed rate limit (warning threshold) | string | `0` | no |
|
||||
| failed_listjobs_rate_threshold_critical | ListJobs Failed rate limit (critical threshold) | string | `10` | no |
|
||||
| failed_listjobs_rate_threshold_warning | ListJobs Failed rate limit (warning threshold) | string | `0` | no |
|
||||
| failed_queryjobs_rate_threshold_critical | QueryJobs Failed rate limit (critical threshold) | string | `10` | no |
|
||||
| failed_queryjobs_rate_threshold_warning | QueryJobs Failed rate limit (warning threshold) | string | `0` | no |
|
||||
| failed_c2d_methods_rate_silenced | Groups to mute for IoT Hub failed c2d methods monitor | map | `<map>` | no |
|
||||
| failed_c2d_methods_rate_threshold_critical | C2D Methods Failed rate limit (critical threshold) | string | `90` | no |
|
||||
| failed_c2d_methods_rate_threshold_warning | C2D Methods Failed rate limit (warning threshold) | string | `50` | no |
|
||||
| failed_c2d_twin_read_rate_silenced | Groups to mute for IoT Hub failed c2d twin read monitor | map | `<map>` | no |
|
||||
| failed_c2d_twin_read_rate_threshold_critical | C2D Twin Read Failed rate limit (critical threshold) | string | `90` | no |
|
||||
| failed_c2d_twin_read_rate_threshold_warning | C2D Twin Read Failed rate limit (warning threshold) | string | `50` | no |
|
||||
| failed_c2d_twin_update_rate_silenced | Groups to mute for IoT Hub failed c2d twin update monitor | map | `<map>` | no |
|
||||
| failed_c2d_twin_update_rate_threshold_critical | C2D Twin Update Failed rate limit (critical threshold) | string | `90` | no |
|
||||
| failed_c2d_twin_update_rate_threshold_warning | C2D Twin Update Failed rate limit (warning threshold) | string | `50` | no |
|
||||
| failed_d2c_twin_read_rate_silenced | Groups to mute for IoT Hub failed d2c twin read monitor | map | `<map>` | no |
|
||||
| failed_d2c_twin_read_rate_threshold_critical | D2C Twin Read Failed rate limit (critical threshold) | string | `90` | no |
|
||||
| failed_d2c_twin_read_rate_threshold_warning | D2C Twin Read Failed rate limit (warning threshold) | string | `50` | no |
|
||||
| failed_d2c_twin_update_rate_silenced | Groups to mute for IoT Hub failed d2c twin update monitor | map | `<map>` | no |
|
||||
| failed_d2c_twin_update_rate_threshold_critical | D2C Twin Update Failed rate limit (critical threshold) | string | `90` | no |
|
||||
| failed_d2c_twin_update_rate_threshold_warning | D2C Twin Update Failed rate limit (warning threshold) | string | `50` | no |
|
||||
| failed_jobs_rate_silenced | Groups to mute for IoT Hub failed jobs monitor | map | `<map>` | no |
|
||||
| failed_jobs_rate_threshold_critical | Jobs Failed rate limit (critical threshold) | string | `90` | no |
|
||||
| failed_jobs_rate_threshold_warning | Jobs Failed rate limit (warning threshold) | string | `50` | no |
|
||||
| failed_listjobs_rate_silenced | Groups to mute for IoT Hub failed list jobs monitor | map | `<map>` | no |
|
||||
| failed_listjobs_rate_threshold_critical | ListJobs Failed rate limit (critical threshold) | string | `90` | no |
|
||||
| failed_listjobs_rate_threshold_warning | ListJobs Failed rate limit (warning threshold) | string | `50` | no |
|
||||
| failed_queryjobs_rate_silenced | Groups to mute for IoT Hub failed query jobs monitor | map | `<map>` | no |
|
||||
| failed_queryjobs_rate_threshold_critical | QueryJobs Failed rate limit (critical threshold) | string | `90` | no |
|
||||
| failed_queryjobs_rate_threshold_warning | QueryJobs Failed rate limit (warning threshold) | string | `50` | no |
|
||||
| fallback_d2c_telemetry_egress_silenced | Groups to mute for IoT Hub fallback d2c telemetry monitor | map | `<map>` | no |
|
||||
| fallback_d2c_telemetry_egress_threshold_critical | D2C Telemetry Fallback limit (critical threshold) | string | `1000` | no |
|
||||
| fallback_d2c_telemetry_egress_threshold_warning | D2C Telemetry Fallback limit (warning threshold) | string | `500` | no |
|
||||
| filter_tags | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no |
|
||||
| filter_tags | Tags used for filtering | string | `*` | no |
|
||||
| invalid_d2c_telemetry_egress_silenced | Groups to mute for IoT Hub invalid d2c telemetry monitor | map | `<map>` | no |
|
||||
| invalid_d2c_telemetry_egress_threshold_critical | D2C Telemetry Invalid limit (critical threshold) | string | `1000` | no |
|
||||
| invalid_d2c_telemetry_egress_threshold_warning | D2C Telemetry Invalid limit (warning threshold) | string | `500` | no |
|
||||
| message | Message sent when an alert is triggered | string | - | yes |
|
||||
| orphaned_d2c_telemetry_egress_silenced | Groups to mute for IoT Hub orphaned d2c telemetry monitor | map | `<map>` | no |
|
||||
| orphaned_d2c_telemetry_egress_threshold_critical | D2C Telemetry Orphaned limit (critical threshold) | string | `1000` | no |
|
||||
| orphaned_d2c_telemetry_egress_threshold_warning | D2C Telemetry Orphaned limit (warning threshold) | string | `500` | no |
|
||||
| status_silenced | Groups to mute for IoT Hub status monitor | map | `<map>` | no |
|
||||
| too_many_d2c_telemetry_ingress_nosent_silenced | Groups to mute for IoT Hub unsent d2c telemetry monitor | map | `<map>` | no |
|
||||
| total_devices_silenced | Groups to mute for IoT Hub total device monitor | map | `<map>` | no |
|
||||
|
||||
Related documentation
|
||||
---------------------
|
||||
|
||||
@ -20,84 +20,156 @@ variable "filter_tags" {
|
||||
}
|
||||
|
||||
# Azure IOT hubs specific
|
||||
variable "status_silenced" {
|
||||
description = "Groups to mute for IoT Hub status monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "total_devices_silenced" {
|
||||
description = "Groups to mute for IoT Hub total device monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "too_many_d2c_telemetry_ingress_nosent_silenced" {
|
||||
description = "Groups to mute for IoT Hub unsent d2c telemetry monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "failed_jobs_rate_silenced" {
|
||||
description = "Groups to mute for IoT Hub failed jobs monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "failed_jobs_rate_threshold_warning" {
|
||||
description = "Jobs Failed rate limit (warning threshold)"
|
||||
default = 0
|
||||
default = 50
|
||||
}
|
||||
|
||||
variable "failed_jobs_rate_threshold_critical" {
|
||||
description = "Jobs Failed rate limit (critical threshold)"
|
||||
default = 10
|
||||
default = 90
|
||||
}
|
||||
|
||||
variable "failed_listjobs_rate_silenced" {
|
||||
description = "Groups to mute for IoT Hub failed list jobs monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "failed_listjobs_rate_threshold_warning" {
|
||||
description = "ListJobs Failed rate limit (warning threshold)"
|
||||
default = 0
|
||||
default = 50
|
||||
}
|
||||
|
||||
variable "failed_listjobs_rate_threshold_critical" {
|
||||
description = "ListJobs Failed rate limit (critical threshold)"
|
||||
default = 10
|
||||
default = 90
|
||||
}
|
||||
|
||||
variable "failed_queryjobs_rate_silenced" {
|
||||
description = "Groups to mute for IoT Hub failed query jobs monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "failed_queryjobs_rate_threshold_warning" {
|
||||
description = "QueryJobs Failed rate limit (warning threshold)"
|
||||
default = 0
|
||||
default = 50
|
||||
}
|
||||
|
||||
variable "failed_queryjobs_rate_threshold_critical" {
|
||||
description = "QueryJobs Failed rate limit (critical threshold)"
|
||||
default = 10
|
||||
default = 90
|
||||
}
|
||||
|
||||
variable "failed_c2d_methods_rate_silenced" {
|
||||
description = "Groups to mute for IoT Hub failed c2d methods monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "failed_c2d_methods_rate_threshold_warning" {
|
||||
description = "C2D Methods Failed rate limit (warning threshold)"
|
||||
default = 0
|
||||
default = 50
|
||||
}
|
||||
|
||||
variable "failed_c2d_methods_rate_threshold_critical" {
|
||||
description = "C2D Methods Failed rate limit (critical threshold)"
|
||||
default = 10
|
||||
default = 90
|
||||
}
|
||||
|
||||
variable "failed_c2d_twin_read_rate_silenced" {
|
||||
description = "Groups to mute for IoT Hub failed c2d twin read monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "failed_c2d_twin_read_rate_threshold_warning" {
|
||||
description = "C2D Twin Read Failed rate limit (warning threshold)"
|
||||
default = 0
|
||||
default = 50
|
||||
}
|
||||
|
||||
variable "failed_c2d_twin_read_rate_threshold_critical" {
|
||||
description = "C2D Twin Read Failed rate limit (critical threshold)"
|
||||
default = 10
|
||||
default = 90
|
||||
}
|
||||
|
||||
variable "failed_c2d_twin_update_rate_silenced" {
|
||||
description = "Groups to mute for IoT Hub failed c2d twin update monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "failed_c2d_twin_update_rate_threshold_warning" {
|
||||
description = "C2D Twin Update Failed rate limit (warning threshold)"
|
||||
default = 0
|
||||
default = 50
|
||||
}
|
||||
|
||||
variable "failed_c2d_twin_update_rate_threshold_critical" {
|
||||
description = "C2D Twin Update Failed rate limit (critical threshold)"
|
||||
default = 10
|
||||
default = 90
|
||||
}
|
||||
|
||||
variable "failed_d2c_twin_read_rate_silenced" {
|
||||
description = "Groups to mute for IoT Hub failed d2c twin read monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "failed_d2c_twin_read_rate_threshold_warning" {
|
||||
description = "D2C Twin Read Failed rate limit (warning threshold)"
|
||||
default = 0
|
||||
default = 50
|
||||
}
|
||||
|
||||
variable "failed_d2c_twin_read_rate_threshold_critical" {
|
||||
description = "D2C Twin Read Failed rate limit (critical threshold)"
|
||||
default = 10
|
||||
default = 90
|
||||
}
|
||||
|
||||
variable "failed_d2c_twin_update_rate_silenced" {
|
||||
description = "Groups to mute for IoT Hub failed d2c twin update monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "failed_d2c_twin_update_rate_threshold_warning" {
|
||||
description = "D2C Twin Update Failed rate limit (warning threshold)"
|
||||
default = 0
|
||||
default = 50
|
||||
}
|
||||
|
||||
variable "failed_d2c_twin_update_rate_threshold_critical" {
|
||||
description = "D2C Twin Update Failed rate limit (critical threshold)"
|
||||
default = 10
|
||||
default = 90
|
||||
}
|
||||
|
||||
variable "dropped_d2c_telemetry_egress_silenced" {
|
||||
description = "Groups to mute for IoT Hub dropped d2c telemetry monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "dropped_d2c_telemetry_egress_threshold_warning" {
|
||||
@ -110,6 +182,12 @@ variable "dropped_d2c_telemetry_egress_threshold_critical" {
|
||||
default = 1000
|
||||
}
|
||||
|
||||
variable "orphaned_d2c_telemetry_egress_silenced" {
|
||||
description = "Groups to mute for IoT Hub orphaned d2c telemetry monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "orphaned_d2c_telemetry_egress_threshold_warning" {
|
||||
description = "D2C Telemetry Orphaned limit (warning threshold)"
|
||||
default = 500
|
||||
@ -120,6 +198,12 @@ variable "orphaned_d2c_telemetry_egress_threshold_critical" {
|
||||
default = 1000
|
||||
}
|
||||
|
||||
variable "invalid_d2c_telemetry_egress_silenced" {
|
||||
description = "Groups to mute for IoT Hub invalid d2c telemetry monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "invalid_d2c_telemetry_egress_threshold_warning" {
|
||||
description = "D2C Telemetry Invalid limit (warning threshold)"
|
||||
default = 500
|
||||
@ -130,6 +214,12 @@ variable "invalid_d2c_telemetry_egress_threshold_critical" {
|
||||
default = 1000
|
||||
}
|
||||
|
||||
variable "fallback_d2c_telemetry_egress_silenced" {
|
||||
description = "Groups to mute for IoT Hub fallback d2c telemetry monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "fallback_d2c_telemetry_egress_threshold_warning" {
|
||||
description = "D2C Telemetry Fallback limit (warning threshold)"
|
||||
default = 500
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
resource "datadog_monitor" "too_many_jobs_failed" {
|
||||
name = "[${var.environment}] IOT Hub Too many jobs failed on {{name}}"
|
||||
name = "[${var.environment}] IOT Hub Too many jobs failed {{comparator}} {{#is_alert}}{{threshold}}%{{/is_alert}}{{#is_warning}}{{warn_threshold}}%{{/is_warning}} ({{value}}%)"
|
||||
message = "${var.message}"
|
||||
|
||||
query = <<EOF
|
||||
@ -17,6 +17,8 @@ resource "datadog_monitor" "too_many_jobs_failed" {
|
||||
critical = "${var.failed_jobs_rate_threshold_critical}"
|
||||
}
|
||||
|
||||
silenced = "${var.failed_jobs_rate_silenced}"
|
||||
|
||||
notify_no_data = false
|
||||
evaluation_delay = "${var.delay}"
|
||||
renotify_interval = 0
|
||||
@ -24,7 +26,7 @@ resource "datadog_monitor" "too_many_jobs_failed" {
|
||||
timeout_h = 1
|
||||
include_tags = true
|
||||
locked = false
|
||||
require_full_window = true
|
||||
require_full_window = false
|
||||
new_host_delay = "${var.delay}"
|
||||
no_data_timeframe = 20
|
||||
|
||||
@ -32,7 +34,7 @@ resource "datadog_monitor" "too_many_jobs_failed" {
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "too_many_list_jobs_failed" {
|
||||
name = "[${var.environment}] IOT Hub Too many list_jobs failure on {{name}}"
|
||||
name = "[${var.environment}] IOT Hub Too many list_jobs failure {{comparator}} {{#is_alert}}{{threshold}}%{{/is_alert}}{{#is_warning}}{{warn_threshold}}%{{/is_warning}} ({{value}}%)"
|
||||
message = "${var.message}"
|
||||
|
||||
query = <<EOF
|
||||
@ -50,6 +52,8 @@ resource "datadog_monitor" "too_many_list_jobs_failed" {
|
||||
critical = "${var.failed_listjobs_rate_threshold_critical}"
|
||||
}
|
||||
|
||||
silenced = "${var.failed_listjobs_rate_silenced}"
|
||||
|
||||
notify_no_data = false
|
||||
evaluation_delay = "${var.delay}"
|
||||
renotify_interval = 0
|
||||
@ -57,7 +61,7 @@ resource "datadog_monitor" "too_many_list_jobs_failed" {
|
||||
timeout_h = 1
|
||||
include_tags = true
|
||||
locked = false
|
||||
require_full_window = true
|
||||
require_full_window = false
|
||||
new_host_delay = "${var.delay}"
|
||||
no_data_timeframe = 20
|
||||
|
||||
@ -65,7 +69,7 @@ resource "datadog_monitor" "too_many_list_jobs_failed" {
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "too_many_query_jobs_failed" {
|
||||
name = "[${var.environment}] IOT Hub Too many query_jobs failed on {{name}}"
|
||||
name = "[${var.environment}] IOT Hub Too many query_jobs failed {{comparator}} {{#is_alert}}{{threshold}}%{{/is_alert}}{{#is_warning}}{{warn_threshold}}%{{/is_warning}} ({{value}}%)"
|
||||
message = "${var.message}"
|
||||
|
||||
query = <<EOF
|
||||
@ -83,6 +87,8 @@ resource "datadog_monitor" "too_many_query_jobs_failed" {
|
||||
critical = "${var.failed_queryjobs_rate_threshold_critical}"
|
||||
}
|
||||
|
||||
silenced = "${var.failed_queryjobs_rate_silenced}"
|
||||
|
||||
notify_no_data = false
|
||||
evaluation_delay = "${var.delay}"
|
||||
renotify_interval = 0
|
||||
@ -90,7 +96,7 @@ resource "datadog_monitor" "too_many_query_jobs_failed" {
|
||||
timeout_h = 1
|
||||
include_tags = true
|
||||
locked = false
|
||||
require_full_window = true
|
||||
require_full_window = false
|
||||
new_host_delay = "${var.delay}"
|
||||
no_data_timeframe = 20
|
||||
|
||||
@ -98,7 +104,7 @@ resource "datadog_monitor" "too_many_query_jobs_failed" {
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "status" {
|
||||
name = "[${var.environment}] IOT Hub Status is not ok on {{name}}"
|
||||
name = "[${var.environment}] IOT Hub is down"
|
||||
message = "${var.message}"
|
||||
|
||||
query = <<EOF
|
||||
@ -107,6 +113,8 @@ resource "datadog_monitor" "status" {
|
||||
|
||||
type = "metric alert"
|
||||
|
||||
silenced = "${var.status_silenced}"
|
||||
|
||||
notify_no_data = true
|
||||
evaluation_delay = "${var.delay}"
|
||||
renotify_interval = 0
|
||||
@ -114,7 +122,7 @@ resource "datadog_monitor" "status" {
|
||||
timeout_h = 0
|
||||
include_tags = true
|
||||
locked = false
|
||||
require_full_window = true
|
||||
require_full_window = false
|
||||
new_host_delay = "${var.delay}"
|
||||
no_data_timeframe = 20
|
||||
|
||||
@ -122,7 +130,7 @@ resource "datadog_monitor" "status" {
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "total_devices" {
|
||||
name = "[${var.environment}] IOT Hub Total devices is wrong on {{name}}"
|
||||
name = "[${var.environment}] IOT Hub Total devices is wrong {{comparator}} {{#is_alert}}{{threshold}}{{/is_alert}}{{#is_warning}}{{warn_threshold}}{{/is_warning}} ({{value}})"
|
||||
message = "${var.message}"
|
||||
|
||||
query = <<EOF
|
||||
@ -131,6 +139,8 @@ resource "datadog_monitor" "total_devices" {
|
||||
|
||||
type = "metric alert"
|
||||
|
||||
silenced = "${var.total_devices_silenced}"
|
||||
|
||||
notify_no_data = true
|
||||
evaluation_delay = "${var.delay}"
|
||||
renotify_interval = 0
|
||||
@ -138,7 +148,7 @@ resource "datadog_monitor" "total_devices" {
|
||||
timeout_h = 0
|
||||
include_tags = true
|
||||
locked = false
|
||||
require_full_window = true
|
||||
require_full_window = false
|
||||
new_host_delay = "${var.delay}"
|
||||
no_data_timeframe = 20
|
||||
|
||||
@ -146,7 +156,7 @@ resource "datadog_monitor" "total_devices" {
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "too_many_c2d_methods_failed" {
|
||||
name = "[${var.environment}] IOT Hub Too many c2d methods failure on {{name}}"
|
||||
name = "[${var.environment}] IOT Hub Too many c2d methods failure {{comparator}} {{#is_alert}}{{threshold}}%{{/is_alert}}{{#is_warning}}{{warn_threshold}}%{{/is_warning}} ({{value}}%)"
|
||||
message = "${var.message}"
|
||||
|
||||
query = <<EOF
|
||||
@ -164,6 +174,8 @@ resource "datadog_monitor" "too_many_c2d_methods_failed" {
|
||||
critical = "${var.failed_c2d_methods_rate_threshold_critical}"
|
||||
}
|
||||
|
||||
silenced = "${var.failed_c2d_methods_rate_silenced}"
|
||||
|
||||
notify_no_data = false
|
||||
evaluation_delay = "${var.delay}"
|
||||
renotify_interval = 0
|
||||
@ -171,7 +183,7 @@ resource "datadog_monitor" "too_many_c2d_methods_failed" {
|
||||
timeout_h = 1
|
||||
include_tags = true
|
||||
locked = false
|
||||
require_full_window = true
|
||||
require_full_window = false
|
||||
new_host_delay = "${var.delay}"
|
||||
no_data_timeframe = 20
|
||||
|
||||
@ -179,7 +191,7 @@ resource "datadog_monitor" "too_many_c2d_methods_failed" {
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "too_many_c2d_twin_read_failed" {
|
||||
name = "[${var.environment}] IOT Hub Too many c2d twin read failure on {{name}}"
|
||||
name = "[${var.environment}] IOT Hub Too many c2d twin read failure {{comparator}} {{#is_alert}}{{threshold}}%{{/is_alert}}{{#is_warning}}{{warn_threshold}}%{{/is_warning}} ({{value}}%)"
|
||||
message = "${var.message}"
|
||||
|
||||
query = <<EOF
|
||||
@ -197,6 +209,8 @@ resource "datadog_monitor" "too_many_c2d_twin_read_failed" {
|
||||
critical = "${var.failed_c2d_twin_read_rate_threshold_critical}"
|
||||
}
|
||||
|
||||
silenced = "${var.failed_c2d_twin_read_rate_silenced}"
|
||||
|
||||
notify_no_data = false
|
||||
evaluation_delay = "${var.delay}"
|
||||
renotify_interval = 0
|
||||
@ -204,7 +218,7 @@ resource "datadog_monitor" "too_many_c2d_twin_read_failed" {
|
||||
timeout_h = 1
|
||||
include_tags = true
|
||||
locked = false
|
||||
require_full_window = true
|
||||
require_full_window = false
|
||||
new_host_delay = "${var.delay}"
|
||||
no_data_timeframe = 20
|
||||
|
||||
@ -212,7 +226,7 @@ resource "datadog_monitor" "too_many_c2d_twin_read_failed" {
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "too_many_c2d_twin_update_failed" {
|
||||
name = "[${var.environment}] IOT Hub Too many c2d twin update failure on {{name}}"
|
||||
name = "[${var.environment}] IOT Hub Too many c2d twin update failure {{comparator}} {{#is_alert}}{{threshold}}%{{/is_alert}}{{#is_warning}}{{warn_threshold}}%{{/is_warning}} ({{value}}%)"
|
||||
message = "${var.message}"
|
||||
|
||||
query = <<EOF
|
||||
@ -230,6 +244,8 @@ resource "datadog_monitor" "too_many_c2d_twin_update_failed" {
|
||||
critical = "${var.failed_c2d_twin_update_rate_threshold_critical}"
|
||||
}
|
||||
|
||||
silenced = "${var.failed_c2d_twin_update_rate_silenced}"
|
||||
|
||||
notify_no_data = false
|
||||
evaluation_delay = "${var.delay}"
|
||||
renotify_interval = 0
|
||||
@ -237,7 +253,7 @@ resource "datadog_monitor" "too_many_c2d_twin_update_failed" {
|
||||
timeout_h = 1
|
||||
include_tags = true
|
||||
locked = false
|
||||
require_full_window = true
|
||||
require_full_window = false
|
||||
new_host_delay = "${var.delay}"
|
||||
no_data_timeframe = 20
|
||||
|
||||
@ -245,7 +261,7 @@ resource "datadog_monitor" "too_many_c2d_twin_update_failed" {
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "too_many_d2c_twin_read_failed" {
|
||||
name = "[${var.environment}] IOT Hub Too many d2c twin read failure on {{name}}"
|
||||
name = "[${var.environment}] IOT Hub Too many d2c twin read failure {{comparator}} {{#is_alert}}{{threshold}}%{{/is_alert}}{{#is_warning}}{{warn_threshold}}%{{/is_warning}} ({{value}}%)"
|
||||
message = "${var.message}"
|
||||
|
||||
query = <<EOF
|
||||
@ -263,6 +279,8 @@ resource "datadog_monitor" "too_many_d2c_twin_read_failed" {
|
||||
critical = "${var.failed_d2c_twin_read_rate_threshold_critical}"
|
||||
}
|
||||
|
||||
silenced = "${var.failed_d2c_twin_read_rate_silenced}"
|
||||
|
||||
notify_no_data = false
|
||||
evaluation_delay = "${var.delay}"
|
||||
renotify_interval = 0
|
||||
@ -270,7 +288,7 @@ resource "datadog_monitor" "too_many_d2c_twin_read_failed" {
|
||||
timeout_h = 1
|
||||
include_tags = true
|
||||
locked = false
|
||||
require_full_window = true
|
||||
require_full_window = false
|
||||
new_host_delay = "${var.delay}"
|
||||
no_data_timeframe = 20
|
||||
|
||||
@ -278,7 +296,7 @@ resource "datadog_monitor" "too_many_d2c_twin_read_failed" {
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "too_many_d2c_twin_update_failed" {
|
||||
name = "[${var.environment}] IOT Hub Too many d2c twin update failure on {{name}}"
|
||||
name = "[${var.environment}] IOT Hub Too many d2c twin update failure {{comparator}} {{#is_alert}}{{threshold}}%{{/is_alert}}{{#is_warning}}{{warn_threshold}}%{{/is_warning}} ({{value}}%)"
|
||||
message = "${var.message}"
|
||||
|
||||
query = <<EOF
|
||||
@ -296,6 +314,8 @@ resource "datadog_monitor" "too_many_d2c_twin_update_failed" {
|
||||
critical = "${var.failed_d2c_twin_update_rate_threshold_critical}"
|
||||
}
|
||||
|
||||
silenced = "${var.failed_d2c_twin_update_rate_silenced}"
|
||||
|
||||
notify_no_data = false
|
||||
evaluation_delay = "${var.delay}"
|
||||
renotify_interval = 0
|
||||
@ -303,7 +323,7 @@ resource "datadog_monitor" "too_many_d2c_twin_update_failed" {
|
||||
timeout_h = 1
|
||||
include_tags = true
|
||||
locked = false
|
||||
require_full_window = true
|
||||
require_full_window = false
|
||||
new_host_delay = "${var.delay}"
|
||||
no_data_timeframe = 20
|
||||
|
||||
@ -311,7 +331,7 @@ resource "datadog_monitor" "too_many_d2c_twin_update_failed" {
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "too_many_d2c_telemetry_egress_dropped" {
|
||||
name = "[${var.environment}] IOT Hub Too many d2c telemetry egress dropped on {{name}}"
|
||||
name = "[${var.environment}] IOT Hub Too many d2c telemetry egress dropped {{comparator}} {{#is_alert}}{{threshold}}{{/is_alert}}{{#is_warning}}{{warn_threshold}}{{/is_warning}} ({{value}})"
|
||||
message = "${var.message}"
|
||||
|
||||
query = <<EOF
|
||||
@ -327,6 +347,8 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_dropped" {
|
||||
critical = "${var.dropped_d2c_telemetry_egress_threshold_critical}"
|
||||
}
|
||||
|
||||
silenced = "${var.dropped_d2c_telemetry_egress_silenced}"
|
||||
|
||||
notify_no_data = false
|
||||
evaluation_delay = "${var.delay}"
|
||||
renotify_interval = 0
|
||||
@ -334,7 +356,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_dropped" {
|
||||
timeout_h = 1
|
||||
include_tags = true
|
||||
locked = false
|
||||
require_full_window = true
|
||||
require_full_window = false
|
||||
new_host_delay = "${var.delay}"
|
||||
no_data_timeframe = 20
|
||||
|
||||
@ -342,7 +364,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_dropped" {
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "too_many_d2c_telemetry_egress_orphaned" {
|
||||
name = "[${var.environment}] IOT Hub Too many d2c telemetry egress orphaned on {{name}}"
|
||||
name = "[${var.environment}] IOT Hub Too many d2c telemetry egress orphaned {{comparator}} {{#is_alert}}{{threshold}}{{/is_alert}}{{#is_warning}}{{warn_threshold}}{{/is_warning}} ({{value}})"
|
||||
message = "${var.message}"
|
||||
|
||||
query = <<EOF
|
||||
@ -358,6 +380,8 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_orphaned" {
|
||||
critical = "${var.orphaned_d2c_telemetry_egress_threshold_critical}"
|
||||
}
|
||||
|
||||
silenced = "${var.orphaned_d2c_telemetry_egress_silenced}"
|
||||
|
||||
notify_no_data = false
|
||||
evaluation_delay = "${var.delay}"
|
||||
renotify_interval = 0
|
||||
@ -365,7 +389,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_orphaned" {
|
||||
timeout_h = 1
|
||||
include_tags = true
|
||||
locked = false
|
||||
require_full_window = true
|
||||
require_full_window = false
|
||||
new_host_delay = "${var.delay}"
|
||||
no_data_timeframe = 20
|
||||
|
||||
@ -373,7 +397,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_orphaned" {
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "too_many_d2c_telemetry_egress_invalid" {
|
||||
name = "[${var.environment}] IOT Hub Too many d2c telemetry egress invalid on {{name}}"
|
||||
name = "[${var.environment}] IOT Hub Too many d2c telemetry egress invalid {{comparator}} {{#is_alert}}{{threshold}}{{/is_alert}}{{#is_warning}}{{warn_threshold}}{{/is_warning}} ({{value}})"
|
||||
message = "${var.message}"
|
||||
|
||||
query = <<EOF
|
||||
@ -389,6 +413,8 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_invalid" {
|
||||
critical = "${var.invalid_d2c_telemetry_egress_threshold_critical}"
|
||||
}
|
||||
|
||||
silenced = "${var.invalid_d2c_telemetry_egress_silenced}"
|
||||
|
||||
notify_no_data = false
|
||||
evaluation_delay = "${var.delay}"
|
||||
renotify_interval = 0
|
||||
@ -396,7 +422,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_invalid" {
|
||||
timeout_h = 1
|
||||
include_tags = true
|
||||
locked = false
|
||||
require_full_window = true
|
||||
require_full_window = false
|
||||
new_host_delay = "${var.delay}"
|
||||
no_data_timeframe = 20
|
||||
|
||||
@ -404,7 +430,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_invalid" {
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "too_many_d2c_telemetry_egress_fallback" {
|
||||
name = "[${var.environment}] IOT Hub Too many d2c telemetry egress fallback on {{name}}"
|
||||
name = "[${var.environment}] IOT Hub Too many d2c telemetry egress fallback {{comparator}} {{#is_alert}}{{threshold}}{{/is_alert}}{{#is_warning}}{{warn_threshold}}{{/is_warning}} ({{value}})"
|
||||
message = "${var.message}"
|
||||
|
||||
query = <<EOF
|
||||
@ -420,6 +446,8 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_fallback" {
|
||||
critical = "${var.fallback_d2c_telemetry_egress_threshold_critical}"
|
||||
}
|
||||
|
||||
silenced = "${var.fallback_d2c_telemetry_egress_silenced}"
|
||||
|
||||
notify_no_data = false
|
||||
evaluation_delay = "${var.delay}"
|
||||
renotify_interval = 0
|
||||
@ -427,7 +455,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_fallback" {
|
||||
timeout_h = 1
|
||||
include_tags = true
|
||||
locked = false
|
||||
require_full_window = true
|
||||
require_full_window = false
|
||||
new_host_delay = "${var.delay}"
|
||||
no_data_timeframe = 20
|
||||
|
||||
@ -435,7 +463,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_egress_fallback" {
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "too_many_d2c_telemetry_ingress_nosent" {
|
||||
name = "[${var.environment}] IOT Hub Too many d2c telemetry ingress no sent on {{name}}"
|
||||
name = "[${var.environment}] IOT Hub Too many d2c telemetry ingress not sent {{comparator}} {{#is_alert}}{{threshold}}{{/is_alert}}{{#is_warning}}{{warn_threshold}}{{/is_warning}} ({{value}})"
|
||||
message = "${var.message}"
|
||||
|
||||
query = <<EOF
|
||||
@ -447,6 +475,8 @@ resource "datadog_monitor" "too_many_d2c_telemetry_ingress_nosent" {
|
||||
|
||||
type = "metric alert"
|
||||
|
||||
silenced = "${var.too_many_d2c_telemetry_ingress_nosent_silenced}"
|
||||
|
||||
notify_no_data = false
|
||||
evaluation_delay = "${var.delay}"
|
||||
renotify_interval = 0
|
||||
@ -454,7 +484,7 @@ resource "datadog_monitor" "too_many_d2c_telemetry_ingress_nosent" {
|
||||
timeout_h = 1
|
||||
include_tags = true
|
||||
locked = false
|
||||
require_full_window = true
|
||||
require_full_window = false
|
||||
new_host_delay = "${var.delay}"
|
||||
no_data_timeframe = 20
|
||||
|
||||
|
||||
@ -8,10 +8,19 @@ module "apimanagement" {
|
||||
filter_tags_use_defaults = "${var.filter_tags_use_defaults}"
|
||||
filter_tags_custom = "${var.filter_tags_custom}"
|
||||
|
||||
status_silenced = "${var.apimanagement_status_silenced}"
|
||||
failed_requests_silenced = "${var.apimanagement_failed_requests_silenced}"
|
||||
failed_requests_threshold_critical = "${var.apimanagement_failed_requests_threshold_critical}"
|
||||
failed_requests_threshold_warning = "${var.apimanagement_failed_requests_threshold_warning}"
|
||||
other_requests_silenced = "${var.apimanagement_other_requests_silenced}"
|
||||
other_requests_threshold_critical = "${var.apimanagement_other_requests_threshold_critical}"
|
||||
other_requests_threshold_warning = "${var.apimanagement_other_requests_threshold_warning}"
|
||||
successful_requests_silenced = "${var.apimanagement_successful_requests_silenced}"
|
||||
successful_requests_threshold_critical = "${var.apimanagement_successful_requests_threshold_critical}"
|
||||
successful_requests_threshold_warning = "${var.apimanagement_successful_requests_threshold_warning}"
|
||||
unauthorized_requests_silenced = "${var.apimanagement_unauthorized_requests_silenced}"
|
||||
unauthorized_requests_threshold_critical = "${var.apimanagement_unauthorized_requests_threshold_critical}"
|
||||
unauthorized_requests_threshold_warning = "${var.apimanagement_unauthorized_requests_threshold_warning}"
|
||||
}
|
||||
|
||||
module "appservices" {
|
||||
@ -24,14 +33,19 @@ module "appservices" {
|
||||
filter_tags_use_defaults = "${var.filter_tags_use_defaults}"
|
||||
filter_tags_custom = "${var.filter_tags_custom}"
|
||||
|
||||
http_successful_requests_silenced = "${var.appservices_http_successful_requests_silenced}"
|
||||
http_successful_requests_threshold_critical = "${var.appservices_http_successful_requests_threshold_critical}"
|
||||
http_successful_requests_threshold_warning = "${var.appservices_http_successful_requests_threshold_warning}"
|
||||
http_5xx_requests_silenced = "${var.appservices_http_5xx_requests_silenced}"
|
||||
http_5xx_requests_threshold_critical = "${var.appservices_http_5xx_requests_threshold_critical}"
|
||||
http_5xx_requests_threshold_warning = "${var.appservices_http_5xx_requests_threshold_warning}"
|
||||
http_4xx_requests_silenced = "${var.appservices_http_4xx_requests_silenced}"
|
||||
http_4xx_requests_threshold_critical = "${var.appservices_http_4xx_requests_threshold_critical}"
|
||||
http_4xx_requests_threshold_warning = "${var.appservices_http_4xx_requests_threshold_warning}"
|
||||
memory_usage_silenced = "${var.appservices_memory_usage_silenced}"
|
||||
memory_usage_threshold_critical = "${var.appservices_memory_usage_threshold_critical}"
|
||||
memory_usage_threshold_warning = "${var.appservices_memory_usage_threshold_warning}"
|
||||
response_time_silenced = "${var.appservices_response_time_silenced}"
|
||||
response_time_threshold_critical = "${var.appservices_response_time_threshold_critical}"
|
||||
response_time_threshold_warning = "${var.appservices_response_time_threshold_warning}"
|
||||
}
|
||||
@ -46,8 +60,11 @@ module "eventhub" {
|
||||
filter_tags_use_defaults = "${var.filter_tags_use_defaults}"
|
||||
filter_tags_custom = "${var.filter_tags_custom}"
|
||||
|
||||
status_silenced = "${var.eventhub_status_silenced}"
|
||||
errors_rate_silenced = "${var.eventhub_errors_rate_silenced}"
|
||||
errors_rate_thresold_critical = "${var.eventhub_errors_rate_thresold_critical}"
|
||||
errors_rate_thresold_warning = "${var.eventhub_errors_rate_thresold_warning}"
|
||||
failed_requests_rate_silenced = "${var.eventhub_failed_requests_rate_silenced}"
|
||||
failed_requests_rate_thresold_critical = "${var.eventhub_failed_requests_rate_thresold_critical}"
|
||||
failed_requests_rate_thresold_warning = "${var.eventhub_failed_requests_rate_thresold_warning}"
|
||||
}
|
||||
@ -61,28 +78,43 @@ module "iothub" {
|
||||
|
||||
filter_tags = "${var.non_taggable_filter_tags}"
|
||||
|
||||
status_silenced = "${var.iothub_status_silenced}"
|
||||
total_devices_silenced = "${var.iothub_total_devices_silenced}"
|
||||
too_many_d2c_telemetry_ingress_nosent_silenced = "${var.iothub_too_many_d2c_telemetry_ingress_nosent_silenced}"
|
||||
dropped_d2c_telemetry_egress_silenced = "${var.iothub_dropped_d2c_telemetry_egress_silenced}"
|
||||
dropped_d2c_telemetry_egress_threshold_critical = "${var.iothub_dropped_d2c_telemetry_egress_threshold_critical}"
|
||||
dropped_d2c_telemetry_egress_threshold_warning = "${var.iothub_dropped_d2c_telemetry_egress_threshold_warning}"
|
||||
failed_c2d_methods_rate_silenced = "${var.iothub_failed_c2d_methods_rate_silenced}"
|
||||
failed_c2d_methods_rate_threshold_critical = "${var.iothub_failed_c2d_methods_rate_threshold_critical}"
|
||||
failed_c2d_methods_rate_threshold_warning = "${var.iothub_failed_c2d_methods_rate_threshold_warning}"
|
||||
failed_c2d_twin_read_rate_silenced = "${var.iothub_failed_c2d_twin_read_rate_silenced}"
|
||||
failed_c2d_twin_read_rate_threshold_critical = "${var.iothub_failed_c2d_twin_read_rate_threshold_critical}"
|
||||
failed_c2d_twin_read_rate_threshold_warning = "${var.iothub_failed_c2d_twin_read_rate_threshold_warning}"
|
||||
failed_c2d_twin_update_rate_silenced = "${var.iothub_failed_c2d_twin_update_rate_silenced}"
|
||||
failed_c2d_twin_update_rate_threshold_critical = "${var.iothub_failed_c2d_twin_update_rate_threshold_critical}"
|
||||
failed_c2d_twin_update_rate_threshold_warning = "${var.iothub_failed_c2d_twin_update_rate_threshold_warning}"
|
||||
failed_d2c_twin_read_rate_silenced = "${var.iothub_failed_d2c_twin_read_rate_silenced}"
|
||||
failed_d2c_twin_read_rate_threshold_critical = "${var.iothub_failed_d2c_twin_read_rate_threshold_critical}"
|
||||
failed_d2c_twin_read_rate_threshold_warning = "${var.iothub_failed_d2c_twin_read_rate_threshold_warning}"
|
||||
failed_d2c_twin_update_rate_silenced = "${var.iothub_failed_d2c_twin_update_rate_silenced}"
|
||||
failed_d2c_twin_update_rate_threshold_critical = "${var.iothub_failed_d2c_twin_update_rate_threshold_critical}"
|
||||
failed_d2c_twin_update_rate_threshold_warning = "${var.iothub_failed_d2c_twin_update_rate_threshold_warning}"
|
||||
failed_jobs_rate_silenced = "${var.iothub_failed_jobs_rate_silenced}"
|
||||
failed_jobs_rate_threshold_critical = "${var.iothub_failed_jobs_rate_threshold_critical}"
|
||||
failed_jobs_rate_threshold_warning = "${var.iothub_failed_jobs_rate_threshold_warning}"
|
||||
failed_listjobs_rate_silenced = "${var.iothub_failed_listjobs_rate_silenced}"
|
||||
failed_listjobs_rate_threshold_critical = "${var.iothub_failed_listjobs_rate_threshold_critical}"
|
||||
failed_listjobs_rate_threshold_warning = "${var.iothub_failed_listjobs_rate_threshold_warning}"
|
||||
failed_queryjobs_rate_silenced = "${var.iothub_failed_queryjobs_rate_silenced}"
|
||||
failed_queryjobs_rate_threshold_critical = "${var.iothub_failed_queryjobs_rate_threshold_critical}"
|
||||
failed_queryjobs_rate_threshold_warning = "${var.iothub_failed_queryjobs_rate_threshold_warning}"
|
||||
fallback_d2c_telemetry_egress_silenced = "${var.iothub_fallback_d2c_telemetry_egress_silenced}"
|
||||
fallback_d2c_telemetry_egress_threshold_critical = "${var.iothub_fallback_d2c_telemetry_egress_threshold_critical}"
|
||||
fallback_d2c_telemetry_egress_threshold_warning = "${var.iothub_fallback_d2c_telemetry_egress_threshold_warning}"
|
||||
invalid_d2c_telemetry_egress_silenced = "${var.iothub_invalid_d2c_telemetry_egress_silenced}"
|
||||
invalid_d2c_telemetry_egress_threshold_critical = "${var.iothub_invalid_d2c_telemetry_egress_threshold_critical}"
|
||||
invalid_d2c_telemetry_egress_threshold_warning = "${var.iothub_invalid_d2c_telemetry_egress_threshold_warning}"
|
||||
orphaned_d2c_telemetry_egress_silenced = "${var.iothub_orphaned_d2c_telemetry_egress_silenced}"
|
||||
orphaned_d2c_telemetry_egress_threshold_critical = "${var.iothub_orphaned_d2c_telemetry_egress_threshold_critical}"
|
||||
orphaned_d2c_telemetry_egress_threshold_warning = "${var.iothub_orphaned_d2c_telemetry_egress_threshold_warning}"
|
||||
}
|
||||
@ -97,10 +129,14 @@ module "redis" {
|
||||
filter_tags_use_defaults = "${var.filter_tags_use_defaults}"
|
||||
filter_tags_custom = "${var.filter_tags_custom}"
|
||||
|
||||
status_silenced = "${var.redis_status_silenced}"
|
||||
evictedkeys_limit_silenced = "${var.redis_evictedkeys_limit_silenced}"
|
||||
evictedkeys_limit_threshold_critical = "${var.redis_evictedkeys_limit_threshold_critical}"
|
||||
evictedkeys_limit_threshold_warning = "${var.redis_evictedkeys_limit_threshold_warning}"
|
||||
percent_processor_time_silenced = "${var.redis_percent_processor_time_silenced}"
|
||||
percent_processor_time_threshold_critical = "${var.redis_percent_processor_time_threshold_critical}"
|
||||
percent_processor_time_threshold_warning = "${var.redis_percent_processor_time_threshold_warning}"
|
||||
server_load_rate_silenced = "${var.redis_server_load_rate_silenced}"
|
||||
server_load_rate_threshold_critical = "${var.redis_server_load_rate_threshold_critical}"
|
||||
server_load_rate_threshold_warning = "${var.redis_server_load_rate_threshold_warning}"
|
||||
}
|
||||
@ -115,11 +151,15 @@ module "sqldatabase" {
|
||||
filter_tags_use_defaults = "${var.filter_tags_use_defaults}"
|
||||
filter_tags_custom = "${var.filter_tags_custom}"
|
||||
|
||||
cpu_silenced = "${var.sqldatabase_cpu_silenced}"
|
||||
cpu_threshold_critical = "${var.sqldatabase_cpu_threshold_critical}"
|
||||
cpu_threshold_warning = "${var.sqldatabase_cpu_threshold_warning}"
|
||||
deadlock_silenced = "${var.sqldatabase_deadlock_silenced}"
|
||||
deadlock_threshold_critical = "${var.sqldatabase_deadlock_threshold_critical}"
|
||||
diskspace_silenced = "${var.sqldatabase_diskspace_silenced}"
|
||||
diskspace_threshold_critical = "${var.sqldatabase_diskspace_threshold_critical}"
|
||||
diskspace_threshold_warning = "${var.sqldatabase_diskspace_threshold_warning}"
|
||||
dtu_silenced = "${var.sqldatabase_dtu_silenced}"
|
||||
dtu_threshold_critical = "${var.sqldatabase_dtu_threshold_critical}"
|
||||
dtu_threshold_warning = "${var.sqldatabase_dtu_threshold_warning}"
|
||||
}
|
||||
@ -134,15 +174,33 @@ module "storage" {
|
||||
filter_tags_use_defaults = "${var.filter_tags_use_defaults}"
|
||||
filter_tags_custom = "${var.filter_tags_custom}"
|
||||
|
||||
authorization_error_requests_silenced = "${var.storage_authorization_error_requests_silenced}"
|
||||
authorization_error_requests_threshold_critical = "${var.storage_authorization_error_requests_threshold_critical}"
|
||||
authorization_error_requests_threshold_warning = "${var.storage_authorization_error_requests_threshold_warning}"
|
||||
availability_silenced = "${var.storage_availability_silenced}"
|
||||
availability_threshold_critical = "${var.storage_availability_threshold_critical}"
|
||||
availability_threshold_warning = "${var.storage_availability_threshold_warning}"
|
||||
client_other_error_requests_silenced = "${var.storage_client_other_error_requests_silenced}"
|
||||
client_other_error_requests_threshold_critical = "${var.storage_client_other_error_requests_threshold_critical}"
|
||||
client_other_error_requests_threshold_warning = "${var.storage_client_other_error_requests_threshold_warning}"
|
||||
latency_silenced = "${var.storage_latency_silenced}"
|
||||
latency_threshold_critical = "${var.storage_latency_threshold_critical}"
|
||||
latency_threshold_warning = "${var.storage_latency_threshold_warning}"
|
||||
network_error_requests_silenced = "${var.storage_network_error_requests_silenced}"
|
||||
network_error_requests_threshold_critical = "${var.storage_network_error_requests_threshold_critical}"
|
||||
network_error_requests_threshold_warning = "${var.storage_network_error_requests_threshold_warning}"
|
||||
server_other_error_requests_silenced = "${var.storage_server_other_error_requests_silenced}"
|
||||
server_other_error_requests_threshold_critical = "${var.storage_server_other_error_requests_threshold_critical}"
|
||||
server_other_error_requests_threshold_warning = "${var.storage_server_other_error_requests_threshold_warning}"
|
||||
successful_requests_silenced = "${var.storage_successful_requests_silenced}"
|
||||
successful_requests_threshold_critical = "${var.storage_successful_requests_threshold_critical}"
|
||||
successful_requests_threshold_warning = "${var.storage_successful_requests_threshold_warning}"
|
||||
throttling_error_requests_silenced = "${var.storage_throttling_error_requests_silenced}"
|
||||
throttling_error_requests_threshold_critical = "${var.storage_throttling_error_requests_threshold_critical}"
|
||||
throttling_error_requests_threshold_warning = "${var.storage_throttling_error_requests_threshold_warning}"
|
||||
timeout_error_requests_silenced = "${var.storage_timeout_error_requests_silenced}"
|
||||
timeout_error_requests_threshold_critical = "${var.storage_timeout_error_requests_threshold_critical}"
|
||||
timeout_error_requests_threshold_warning = "${var.storage_timeout_error_requests_threshold_warning}"
|
||||
}
|
||||
|
||||
module "streamanalytics" {
|
||||
@ -155,12 +213,16 @@ module "streamanalytics" {
|
||||
filter_tags_use_defaults = "${var.filter_tags_use_defaults}"
|
||||
filter_tags_custom = "${var.filter_tags_custom}"
|
||||
|
||||
conversion_errors_silenced = "${var.streamanalytics_conversion_errors_silenced}"
|
||||
conversion_errors_threshold_critical = "${var.streamanalytics_conversion_errors_threshold_critical}"
|
||||
conversion_errors_threshold_warning = "${var.streamanalytics_conversion_errors_threshold_warning}"
|
||||
failed_function_requests_silenced = "${var.streamanalytics_failed_function_requests_silenced}"
|
||||
failed_function_requests_threshold_critical = "${var.streamanalytics_failed_function_requests_threshold_critical}"
|
||||
function_requests_threshold_warning = "${var.streamanalytics_function_requests_threshold_warning}"
|
||||
failed_function_requests_threshold_warning = "${var.streamanalytics_failed_function_requests_threshold_warning}"
|
||||
runtime_errors_silenced = "${var.streamanalytics_runtime_errors_silenced}"
|
||||
runtime_errors_threshold_critical = "${var.streamanalytics_runtime_errors_threshold_critical}"
|
||||
runtime_errors_threshold_warning = "${var.streamanalytics_runtime_errors_threshold_warning}"
|
||||
su_utilization_silenced = "${var.streamanalytics_su_utilization_silenced}"
|
||||
su_utilization_threshold_critical = "${var.streamanalytics_su_utilization_threshold_critical}"
|
||||
su_utilization_threshold_warning = "${var.streamanalytics_su_utilization_threshold_warning}"
|
||||
}
|
||||
|
||||
@ -29,13 +29,16 @@ Inputs
|
||||
|------|-------------|:----:|:-----:|:-----:|
|
||||
| delay | Delay in seconds for the metric evaluation | string | `600` | no |
|
||||
| environment | Architecture environment | string | - | yes |
|
||||
| evictedkeys_limit_silenced | Groups to mute for Redis evicted keys monitor | map | `<map>` | no |
|
||||
| evictedkeys_limit_threshold_critical | Evicted keys limit (critical threshold) | string | `100` | no |
|
||||
| evictedkeys_limit_threshold_warning | Evicted keys limit (warning threshold) | string | `0` | no |
|
||||
| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no |
|
||||
| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no |
|
||||
| message | Message sent when a Redis monitor is triggered | string | - | yes |
|
||||
| percent_processor_time_silenced | Groups to mute for Redis processor monitor | map | `<map>` | no |
|
||||
| percent_processor_time_threshold_critical | Processor time percent (critical threshold) | string | `80` | no |
|
||||
| percent_processor_time_threshold_warning | Processor time percent (warning threshold) | string | `60` | no |
|
||||
| server_load_rate_silenced | Groups to mute for Redis server load monitor | map | `<map>` | no |
|
||||
| server_load_rate_threshold_critical | Server CPU load rate (critical threshold) | string | `90` | no |
|
||||
| server_load_rate_threshold_warning | Server CPU load rate (warning threshold) | string | `70` | no |
|
||||
|
||||
|
||||
@ -25,6 +25,18 @@ variable "filter_tags_custom" {
|
||||
}
|
||||
|
||||
# Azure Redis specific
|
||||
variable "status_silenced" {
|
||||
description = "Groups to mute for Redis status monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "evictedkeys_limit_silenced" {
|
||||
description = "Groups to mute for Redis evicted keys monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "evictedkeys_limit_threshold_warning" {
|
||||
description = "Evicted keys limit (warning threshold)"
|
||||
default = 0
|
||||
@ -35,6 +47,12 @@ variable "evictedkeys_limit_threshold_critical" {
|
||||
default = 100
|
||||
}
|
||||
|
||||
variable "percent_processor_time_silenced" {
|
||||
description = "Groups to mute for Redis processor monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "percent_processor_time_threshold_critical" {
|
||||
description = "Processor time percent (critical threshold)"
|
||||
default = 80
|
||||
@ -45,6 +63,12 @@ variable "percent_processor_time_threshold_warning" {
|
||||
default = 60
|
||||
}
|
||||
|
||||
variable "server_load_rate_silenced" {
|
||||
description = "Groups to mute for Redis server load monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "server_load_rate_threshold_critical" {
|
||||
description = "Server CPU load rate (critical threshold)"
|
||||
default = 90
|
||||
|
||||
@ -16,6 +16,8 @@ EOF
|
||||
|
||||
type = "metric alert"
|
||||
|
||||
silenced = "${var.status_silenced}"
|
||||
|
||||
notify_no_data = true
|
||||
evaluation_delay = "${var.delay}"
|
||||
renotify_interval = 0
|
||||
@ -23,7 +25,7 @@ EOF
|
||||
timeout_h = 0
|
||||
include_tags = true
|
||||
locked = false
|
||||
require_full_window = true
|
||||
require_full_window = false
|
||||
new_host_delay = "${var.delay}"
|
||||
no_data_timeframe = 20
|
||||
|
||||
@ -31,7 +33,7 @@ EOF
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "evictedkeys" {
|
||||
name = "[${var.environment}] Redis {{value}} evictedkeys on {{name}}"
|
||||
name = "[${var.environment}] Redis too many evictedkeys {{comparator}} {{#is_alert}}{{threshold}}{{/is_alert}}{{#is_warning}}{{warn_threshold}}{{/is_warning}} ({{value}})"
|
||||
message = "${var.message}"
|
||||
|
||||
query = <<EOF
|
||||
@ -47,6 +49,8 @@ EOF
|
||||
critical = "${var.evictedkeys_limit_threshold_critical}"
|
||||
}
|
||||
|
||||
silenced = "${var.evictedkeys_limit_silenced}"
|
||||
|
||||
notify_no_data = false
|
||||
evaluation_delay = "${var.delay}"
|
||||
renotify_interval = 0
|
||||
@ -54,7 +58,7 @@ EOF
|
||||
timeout_h = 0
|
||||
include_tags = true
|
||||
locked = false
|
||||
require_full_window = true
|
||||
require_full_window = false
|
||||
new_host_delay = "${var.delay}"
|
||||
no_data_timeframe = 20
|
||||
|
||||
@ -62,7 +66,7 @@ EOF
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "percent_processor_time" {
|
||||
name = "[${var.environment}] Redis processor time {{value}}% on {{name}}"
|
||||
name = "[${var.environment}] Redis processor time too high {{comparator}} {{#is_alert}}{{threshold}}%{{/is_alert}}{{#is_warning}}{{warn_threshold}}%{{/is_warning}} ({{value}}%)"
|
||||
message = "${var.message}"
|
||||
|
||||
query = <<EOF
|
||||
@ -78,6 +82,8 @@ EOF
|
||||
critical = "${var.percent_processor_time_threshold_critical}"
|
||||
}
|
||||
|
||||
silenced = "${var.percent_processor_time_silenced}"
|
||||
|
||||
notify_no_data = false
|
||||
evaluation_delay = "${var.delay}"
|
||||
renotify_interval = 0
|
||||
@ -85,7 +91,7 @@ EOF
|
||||
timeout_h = 0
|
||||
include_tags = true
|
||||
locked = false
|
||||
require_full_window = true
|
||||
require_full_window = false
|
||||
new_host_delay = "${var.delay}"
|
||||
no_data_timeframe = 20
|
||||
|
||||
@ -93,7 +99,7 @@ EOF
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "server_load" {
|
||||
name = "[${var.environment}] Redis processor server load {{value}}% on {{name}}"
|
||||
name = "[${var.environment}] Redis server load too high {{comparator}} {{#is_alert}}{{threshold}}%{{/is_alert}}{{#is_warning}}{{warn_threshold}}%{{/is_warning}} ({{value}}%)"
|
||||
message = "${var.message}"
|
||||
|
||||
query = <<EOF
|
||||
@ -109,6 +115,8 @@ EOF
|
||||
critical = "${var.server_load_rate_threshold_critical}"
|
||||
}
|
||||
|
||||
silenced = "${var.server_load_rate_silenced}"
|
||||
|
||||
notify_no_data = false
|
||||
evaluation_delay = "${var.delay}"
|
||||
renotify_interval = 0
|
||||
@ -116,7 +124,7 @@ EOF
|
||||
timeout_h = 0
|
||||
include_tags = true
|
||||
locked = false
|
||||
require_full_window = true
|
||||
require_full_window = false
|
||||
new_host_delay = "${var.delay}"
|
||||
no_data_timeframe = 20
|
||||
|
||||
|
||||
@ -27,12 +27,16 @@ Inputs
|
||||
|
||||
| Name | Description | Type | Default | Required |
|
||||
|------|-------------|:----:|:-----:|:-----:|
|
||||
| cpu_silenced | Groups to mute for SQL CPU monitor | map | `<map>` | no |
|
||||
| cpu_threshold_critical | CPU usage in percent (critical threshold) | string | `90` | no |
|
||||
| cpu_threshold_warning | CPU usage in percent (warning threshold) | string | `80` | no |
|
||||
| deadlock_silenced | Groups to mute for SQL Deadlock monitor | map | `<map>` | no |
|
||||
| deadlock_threshold_critical | Amount of Deadlocks (critical threshold) | string | `1` | no |
|
||||
| delay | Delay in seconds for the metric evaluation | string | `600` | no |
|
||||
| diskspace_silenced | Groups to mute for SQL disk space monitor | map | `<map>` | no |
|
||||
| diskspace_threshold_critical | Disk space used in percent (critical threshold) | string | `90` | no |
|
||||
| diskspace_threshold_warning | Disk space used in percent (warning threshold) | string | `80` | no |
|
||||
| dtu_silenced | Groups to mute for SQL DTU monitor | map | `<map>` | no |
|
||||
| dtu_threshold_critical | Amount of DTU used (critical threshold) | string | `90` | no |
|
||||
| dtu_threshold_warning | Amount of DTU used (warning threshold) | string | `85` | no |
|
||||
| environment | Architecture Environment | string | - | yes |
|
||||
|
||||
@ -25,6 +25,11 @@ variable "filter_tags_custom" {
|
||||
}
|
||||
|
||||
# Azure SQL Database specific
|
||||
variable "cpu_silenced" {
|
||||
description = "Groups to mute for SQL CPU monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "cpu_threshold_warning" {
|
||||
description = "CPU usage in percent (warning threshold)"
|
||||
@ -36,6 +41,12 @@ variable "cpu_threshold_critical" {
|
||||
default = "90"
|
||||
}
|
||||
|
||||
variable "diskspace_silenced" {
|
||||
description = "Groups to mute for SQL disk space monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "diskspace_threshold_warning" {
|
||||
description = "Disk space used in percent (warning threshold)"
|
||||
default = "80"
|
||||
@ -46,6 +57,12 @@ variable "diskspace_threshold_critical" {
|
||||
default = "90"
|
||||
}
|
||||
|
||||
variable "dtu_silenced" {
|
||||
description = "Groups to mute for SQL DTU monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "dtu_threshold_warning" {
|
||||
description = "Amount of DTU used (warning threshold)"
|
||||
default = "85"
|
||||
@ -56,6 +73,12 @@ variable "dtu_threshold_critical" {
|
||||
default = "90"
|
||||
}
|
||||
|
||||
variable "deadlock_silenced" {
|
||||
description = "Groups to mute for SQL Deadlock monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "deadlock_threshold_critical" {
|
||||
description = "Amount of Deadlocks (critical threshold)"
|
||||
default = "1"
|
||||
|
||||
@ -7,7 +7,7 @@ data "template_file" "filter" {
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "sql-database_cpu_90_15min" {
|
||||
name = "[${var.environment}] SQL Database CPU high > ${var.cpu_threshold_critical}% on {{name}}"
|
||||
name = "[${var.environment}] SQL Database CPU too high {{comparator}} {{#is_alert}}{{threshold}}%{{/is_alert}}{{#is_warning}}{{warn_threshold}}%{{/is_warning}} ({{value}}%)"
|
||||
message = "${var.message}"
|
||||
|
||||
query = <<EOF
|
||||
@ -22,6 +22,8 @@ resource "datadog_monitor" "sql-database_cpu_90_15min" {
|
||||
critical = "${var.cpu_threshold_critical}"
|
||||
}
|
||||
|
||||
silenced = "${var.cpu_silenced}"
|
||||
|
||||
notify_no_data = true
|
||||
evaluation_delay = "${var.delay}"
|
||||
renotify_interval = 0
|
||||
@ -29,7 +31,7 @@ resource "datadog_monitor" "sql-database_cpu_90_15min" {
|
||||
timeout_h = 0
|
||||
include_tags = true
|
||||
locked = false
|
||||
require_full_window = true
|
||||
require_full_window = false
|
||||
new_host_delay = "${var.delay}"
|
||||
no_data_timeframe = 20
|
||||
|
||||
@ -37,7 +39,7 @@ resource "datadog_monitor" "sql-database_cpu_90_15min" {
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "sql-database_free_space_low" {
|
||||
name = "[${var.environment}] SQL Database free space < ${var.diskspace_threshold_critical}% on {{name}}"
|
||||
name = "[${var.environment}] SQL Database low free space {{comparator}} {{#is_alert}}{{threshold}}%{{/is_alert}}{{#is_warning}}{{warn_threshold}}%{{/is_warning}} ({{value}}%)"
|
||||
message = "${var.message}"
|
||||
|
||||
type = "metric alert"
|
||||
@ -53,6 +55,8 @@ resource "datadog_monitor" "sql-database_free_space_low" {
|
||||
critical = "${var.diskspace_threshold_critical}"
|
||||
}
|
||||
|
||||
silenced = "${var.diskspace_silenced}"
|
||||
|
||||
notify_no_data = true
|
||||
evaluation_delay = "${var.delay}"
|
||||
renotify_interval = 0
|
||||
@ -60,7 +64,7 @@ resource "datadog_monitor" "sql-database_free_space_low" {
|
||||
timeout_h = 0
|
||||
include_tags = true
|
||||
locked = false
|
||||
require_full_window = true
|
||||
require_full_window = false
|
||||
new_host_delay = "${var.delay}"
|
||||
no_data_timeframe = 20
|
||||
|
||||
@ -68,7 +72,7 @@ resource "datadog_monitor" "sql-database_free_space_low" {
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "sql-database_dtu_consumption_high" {
|
||||
name = "[${var.environment}] SQL Database DTU Consumption on {{name}} > ${var.dtu_threshold_critical}"
|
||||
name = "[${var.environment}] SQL Database DTU Consumption too high {{comparator}} {{#is_alert}}{{threshold}}%{{/is_alert}}{{#is_warning}}{{warn_threshold}}%{{/is_warning}} ({{value}}%)"
|
||||
message = "${var.message}"
|
||||
|
||||
type = "metric alert"
|
||||
@ -84,6 +88,8 @@ resource "datadog_monitor" "sql-database_dtu_consumption_high" {
|
||||
critical = "${var.dtu_threshold_critical}"
|
||||
}
|
||||
|
||||
silenced = "${var.dtu_silenced}"
|
||||
|
||||
notify_no_data = true
|
||||
evaluation_delay = "${var.delay}"
|
||||
renotify_interval = 0
|
||||
@ -91,7 +97,7 @@ resource "datadog_monitor" "sql-database_dtu_consumption_high" {
|
||||
timeout_h = 0
|
||||
include_tags = true
|
||||
locked = false
|
||||
require_full_window = true
|
||||
require_full_window = false
|
||||
new_host_delay = "${var.delay}"
|
||||
no_data_timeframe = 20
|
||||
|
||||
@ -99,7 +105,7 @@ resource "datadog_monitor" "sql-database_dtu_consumption_high" {
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "sql-database_deadlocks_count" {
|
||||
name = "[${var.environment}] SQL Database Deadlocks too high on {{name}}"
|
||||
name = "[${var.environment}] SQL Database Deadlocks too high {{comparator}} {{#is_alert}}{{threshold}}{{/is_alert}}{{#is_warning}}{{warn_threshold}}{{/is_warning}} ({{value}})"
|
||||
message = "${var.message}"
|
||||
|
||||
type = "metric alert"
|
||||
@ -114,6 +120,8 @@ resource "datadog_monitor" "sql-database_deadlocks_count" {
|
||||
critical = "${var.deadlock_threshold_critical}"
|
||||
}
|
||||
|
||||
silenced = "${var.deadlock_silenced}"
|
||||
|
||||
notify_no_data = false
|
||||
evaluation_delay = "${var.delay}"
|
||||
renotify_interval = 0
|
||||
@ -121,7 +129,7 @@ resource "datadog_monitor" "sql-database_deadlocks_count" {
|
||||
timeout_h = 1
|
||||
include_tags = true
|
||||
locked = false
|
||||
require_full_window = true
|
||||
require_full_window = false
|
||||
new_host_delay = "${var.delay}"
|
||||
no_data_timeframe = 20
|
||||
|
||||
|
||||
@ -32,20 +32,38 @@ Inputs
|
||||
|
||||
| Name | Description | Type | Default | Required |
|
||||
|------|-------------|:----:|:-----:|:-----:|
|
||||
| authorization_error_requests_threshold_critical | Maximum acceptable percent of authorization error requests for a storage | string | `15` | no |
|
||||
| availability_threshold_critical | Minimum acceptable percent of availability for a storage | string | `90` | no |
|
||||
| client_other_error_requests_threshold_critical | Maximum acceptable percent of client other error requests for a storage | string | `15` | no |
|
||||
| authorization_error_requests_silenced | Groups to mute for Storage authorization errors monitor | map | `<map>` | no |
|
||||
| authorization_error_requests_threshold_critical | Maximum acceptable percent of authorization error requests for a storage | string | `90` | no |
|
||||
| authorization_error_requests_threshold_warning | Warning regarding acceptable percent of authorization error requests for a storage | string | `50` | no |
|
||||
| availability_silenced | Groups to mute for Storage availability monitor | map | `<map>` | no |
|
||||
| availability_threshold_critical | Minimum acceptable percent of availability for a storage | string | `50` | no |
|
||||
| availability_threshold_warning | Warning regarding acceptable percent of availability for a storage | string | `90` | no |
|
||||
| client_other_error_requests_silenced | Groups to mute for Storage other errors monitor | map | `<map>` | no |
|
||||
| client_other_error_requests_threshold_critical | Maximum acceptable percent of client other error requests for a storage | string | `90` | no |
|
||||
| client_other_error_requests_threshold_warning | Warning regarding acceptable percent of client other error requests for a storage | string | `50` | no |
|
||||
| delay | Delay in seconds for the metric evaluation | string | `600` | no |
|
||||
| environment | Architecture environment | string | - | yes |
|
||||
| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no |
|
||||
| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no |
|
||||
| latency_threshold_critical | Maximum acceptable end to end latency (ms) for a storage | string | `1000` | no |
|
||||
| latency_silenced | Groups to mute for Storage latency monitor | map | `<map>` | no |
|
||||
| latency_threshold_critical | Maximum acceptable end to end latency (ms) for a storage | string | `2000` | no |
|
||||
| latency_threshold_warning | Warning regarding acceptable end to end latency (ms) for a storage | string | `1000` | no |
|
||||
| message | Message sent when a Redis monitor is triggered | string | - | yes |
|
||||
| network_error_requests_threshold_critical | Maximum acceptable percent of network error requests for a storage | string | `5` | no |
|
||||
| server_other_error_requests_threshold_critical | Maximum acceptable percent of server other error requests for a storage | string | `10` | no |
|
||||
| successful_requests_threshold_critical | Minimum acceptable percent of successful requests for a storage | string | `90` | no |
|
||||
| throttling_error_requests_threshold_critical | Maximum acceptable percent of throttling error requests for a storage | string | `10` | no |
|
||||
| timeout_error_requests_threshold_critical | Maximum acceptable percent of timeout error requests for a storage | string | `5` | no |
|
||||
| network_error_requests_silenced | Groups to mute for Storage network errors monitor | map | `<map>` | no |
|
||||
| network_error_requests_threshold_critical | Maximum acceptable percent of network error requests for a storage | string | `90` | no |
|
||||
| network_error_requests_threshold_warning | Warning regarding acceptable percent of network error requests for a storage | string | `50` | no |
|
||||
| server_other_error_requests_silenced | Groups to mute for Storage server other errors monitor | map | `<map>` | no |
|
||||
| server_other_error_requests_threshold_critical | Maximum acceptable percent of server other error requests for a storage | string | `90` | no |
|
||||
| server_other_error_requests_threshold_warning | Warning regarding acceptable percent of server other error requests for a storage | string | `50` | no |
|
||||
| successful_requests_silenced | Groups to mute for Storage sucessful requests monitor | map | `<map>` | no |
|
||||
| successful_requests_threshold_critical | Minimum acceptable percent of successful requests for a storage | string | `10` | no |
|
||||
| successful_requests_threshold_warning | Warning regarding acceptable percent of successful requests for a storage | string | `30` | no |
|
||||
| throttling_error_requests_silenced | Groups to mute for Storage throttling error monitor | map | `<map>` | no |
|
||||
| throttling_error_requests_threshold_critical | Maximum acceptable percent of throttling error requests for a storage | string | `90` | no |
|
||||
| throttling_error_requests_threshold_warning | Warning regarding acceptable percent of throttling error requests for a storage | string | `50` | no |
|
||||
| timeout_error_requests_silenced | Groups to mute for Storage timeout monitor | map | `<map>` | no |
|
||||
| timeout_error_requests_threshold_critical | Maximum acceptable percent of timeout error requests for a storage | string | `90` | no |
|
||||
| timeout_error_requests_threshold_warning | Warning regarding acceptable percent of timeout error requests for a storage | string | `50` | no |
|
||||
|
||||
Related documentation
|
||||
---------------------
|
||||
|
||||
@ -25,47 +25,146 @@ variable "filter_tags_custom" {
|
||||
}
|
||||
|
||||
# Azure Storage specific
|
||||
variable "availability_silenced" {
|
||||
description = "Groups to mute for Storage availability monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "availability_threshold_critical" {
|
||||
description = "Minimum acceptable percent of availability for a storage"
|
||||
default = 50
|
||||
}
|
||||
|
||||
variable "availability_threshold_warning" {
|
||||
description = "Warning regarding acceptable percent of availability for a storage"
|
||||
default = 90
|
||||
}
|
||||
|
||||
variable "successful_requests_silenced" {
|
||||
description = "Groups to mute for Storage sucessful requests monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "successful_requests_threshold_critical" {
|
||||
description = "Minimum acceptable percent of successful requests for a storage"
|
||||
default = 90
|
||||
default = 10
|
||||
}
|
||||
|
||||
variable "successful_requests_threshold_warning" {
|
||||
description = "Warning regarding acceptable percent of successful requests for a storage"
|
||||
default = 30
|
||||
}
|
||||
|
||||
variable "latency_silenced" {
|
||||
description = "Groups to mute for Storage latency monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "latency_threshold_critical" {
|
||||
description = "Maximum acceptable end to end latency (ms) for a storage"
|
||||
default = 2000
|
||||
}
|
||||
|
||||
variable "latency_threshold_warning" {
|
||||
description = "Warning regarding acceptable end to end latency (ms) for a storage"
|
||||
default = 1000
|
||||
}
|
||||
|
||||
variable "timeout_error_requests_silenced" {
|
||||
description = "Groups to mute for Storage timeout monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "timeout_error_requests_threshold_critical" {
|
||||
description = "Maximum acceptable percent of timeout error requests for a storage"
|
||||
default = 5
|
||||
default = 90
|
||||
}
|
||||
|
||||
variable "timeout_error_requests_threshold_warning" {
|
||||
description = "Warning regarding acceptable percent of timeout error requests for a storage"
|
||||
default = 50
|
||||
}
|
||||
|
||||
variable "network_error_requests_silenced" {
|
||||
description = "Groups to mute for Storage network errors monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "network_error_requests_threshold_critical" {
|
||||
description = "Maximum acceptable percent of network error requests for a storage"
|
||||
default = 5
|
||||
default = 90
|
||||
}
|
||||
|
||||
variable "network_error_requests_threshold_warning" {
|
||||
description = "Warning regarding acceptable percent of network error requests for a storage"
|
||||
default = 50
|
||||
}
|
||||
|
||||
variable "throttling_error_requests_silenced" {
|
||||
description = "Groups to mute for Storage throttling error monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "throttling_error_requests_threshold_critical" {
|
||||
description = "Maximum acceptable percent of throttling error requests for a storage"
|
||||
default = 10
|
||||
default = 90
|
||||
}
|
||||
|
||||
variable "throttling_error_requests_threshold_warning" {
|
||||
description = "Warning regarding acceptable percent of throttling error requests for a storage"
|
||||
default = 50
|
||||
}
|
||||
|
||||
variable "server_other_error_requests_silenced" {
|
||||
description = "Groups to mute for Storage server other errors monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "server_other_error_requests_threshold_critical" {
|
||||
description = "Maximum acceptable percent of server other error requests for a storage"
|
||||
default = 10
|
||||
default = 90
|
||||
}
|
||||
|
||||
variable "server_other_error_requests_threshold_warning" {
|
||||
description = "Warning regarding acceptable percent of server other error requests for a storage"
|
||||
default = 50
|
||||
}
|
||||
|
||||
variable "client_other_error_requests_silenced" {
|
||||
description = "Groups to mute for Storage other errors monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "client_other_error_requests_threshold_critical" {
|
||||
description = "Maximum acceptable percent of client other error requests for a storage"
|
||||
default = 15
|
||||
default = 90
|
||||
}
|
||||
|
||||
variable "client_other_error_requests_threshold_warning" {
|
||||
description = "Warning regarding acceptable percent of client other error requests for a storage"
|
||||
default = 50
|
||||
}
|
||||
|
||||
variable "authorization_error_requests_silenced" {
|
||||
description = "Groups to mute for Storage authorization errors monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "authorization_error_requests_threshold_critical" {
|
||||
description = "Maximum acceptable percent of authorization error requests for a storage"
|
||||
default = 15
|
||||
default = 90
|
||||
}
|
||||
|
||||
variable "authorization_error_requests_threshold_warning" {
|
||||
description = "Warning regarding acceptable percent of authorization error requests for a storage"
|
||||
default = 50
|
||||
}
|
||||
|
||||
@ -7,7 +7,7 @@ data "template_file" "filter" {
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "availability" {
|
||||
name = "[${var.environment}] Azure Storage {{name}} unavailability detected"
|
||||
name = "[${var.environment}] Azure Storage is down"
|
||||
message = "${var.message}"
|
||||
|
||||
query = <<EOF
|
||||
@ -18,15 +18,18 @@ EOF
|
||||
|
||||
thresholds {
|
||||
critical = "${var.availability_threshold_critical}"
|
||||
warning = "${var.availability_threshold_warning}"
|
||||
}
|
||||
|
||||
silenced = "${var.availability_silenced}"
|
||||
|
||||
type = "metric alert"
|
||||
notify_no_data = false
|
||||
notify_audit = false
|
||||
timeout_h = 0
|
||||
include_tags = true
|
||||
locked = false
|
||||
require_full_window = true
|
||||
require_full_window = false
|
||||
new_host_delay = "${var.delay}"
|
||||
evaluation_delay = "${var.delay}"
|
||||
renotify_interval = 0
|
||||
@ -36,7 +39,7 @@ EOF
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "successful_requests" {
|
||||
name = "[${var.environment}] Azure Storage {{name}} too much failed requests"
|
||||
name = "[${var.environment}] Azure Storage too few successful requests {{comparator}} {{#is_alert}}{{threshold}}%{{/is_alert}}{{#is_warning}}{{warn_threshold}}%{{/is_warning}} ({{value}}%)"
|
||||
message = "${var.message}"
|
||||
|
||||
query = <<EOF
|
||||
@ -47,15 +50,18 @@ EOF
|
||||
|
||||
thresholds {
|
||||
critical = "${var.successful_requests_threshold_critical}"
|
||||
warning = "${var.successful_requests_threshold_warning}"
|
||||
}
|
||||
|
||||
silenced = "${var.successful_requests_silenced}"
|
||||
|
||||
type = "metric alert"
|
||||
notify_no_data = false
|
||||
notify_audit = false
|
||||
timeout_h = 0
|
||||
include_tags = true
|
||||
locked = false
|
||||
require_full_window = true
|
||||
require_full_window = false
|
||||
new_host_delay = "${var.delay}"
|
||||
evaluation_delay = "${var.delay}"
|
||||
renotify_interval = 0
|
||||
@ -65,7 +71,7 @@ EOF
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "latency" {
|
||||
name = "[${var.environment}] Azure Storage {{name}} too high end to end latency"
|
||||
name = "[${var.environment}] Azure Storage too high end to end latency {{comparator}} {{#is_alert}}{{threshold}}ms{{/is_alert}}{{#is_warning}}{{warn_threshold}}ms{{/is_warning}} ({{value}}ms)"
|
||||
message = "${var.message}"
|
||||
|
||||
query = <<EOF
|
||||
@ -76,15 +82,18 @@ EOF
|
||||
|
||||
thresholds {
|
||||
critical = "${var.latency_threshold_critical}"
|
||||
warning = "${var.latency_threshold_warning}"
|
||||
}
|
||||
|
||||
silenced = "${var.latency_silenced}"
|
||||
|
||||
type = "metric alert"
|
||||
notify_no_data = false
|
||||
notify_audit = false
|
||||
timeout_h = 0
|
||||
include_tags = true
|
||||
locked = false
|
||||
require_full_window = true
|
||||
require_full_window = false
|
||||
new_host_delay = "${var.delay}"
|
||||
evaluation_delay = "${var.delay}"
|
||||
renotify_interval = 0
|
||||
@ -94,7 +103,7 @@ EOF
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "timeout_error_requests" {
|
||||
name = "[${var.environment}] Azure Storage {{value}}% of timeout error requests on {{name}}"
|
||||
name = "[${var.environment}] Azure Storage too many timeout errors {{comparator}} {{#is_alert}}{{threshold}}%{{/is_alert}}{{#is_warning}}{{warn_threshold}}%{{/is_warning}} ({{value}}%)"
|
||||
message = "${var.message}"
|
||||
|
||||
query = <<EOF
|
||||
@ -105,15 +114,18 @@ EOF
|
||||
|
||||
thresholds {
|
||||
critical = "${var.timeout_error_requests_threshold_critical}"
|
||||
warning = "${var.timeout_error_requests_threshold_warning}"
|
||||
}
|
||||
|
||||
silenced = "${var.timeout_error_requests_silenced}"
|
||||
|
||||
type = "metric alert"
|
||||
notify_no_data = false
|
||||
notify_audit = false
|
||||
timeout_h = 0
|
||||
include_tags = true
|
||||
locked = false
|
||||
require_full_window = true
|
||||
require_full_window = false
|
||||
new_host_delay = "${var.delay}"
|
||||
evaluation_delay = "${var.delay}"
|
||||
renotify_interval = 0
|
||||
@ -123,7 +135,7 @@ EOF
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "network_error_requests" {
|
||||
name = "[${var.environment}] Azure Storage {{value}}% of network error requests on {{name}}"
|
||||
name = "[${var.environment}] Azure Storage too many network errors {{comparator}} {{#is_alert}}{{threshold}}%{{/is_alert}}{{#is_warning}}{{warn_threshold}}%{{/is_warning}} ({{value}}%)"
|
||||
message = "${var.message}"
|
||||
|
||||
query = <<EOF
|
||||
@ -134,15 +146,18 @@ EOF
|
||||
|
||||
thresholds {
|
||||
critical = "${var.network_error_requests_threshold_critical}"
|
||||
warning = "${var.network_error_requests_threshold_warning}"
|
||||
}
|
||||
|
||||
silenced = "${var.network_error_requests_silenced}"
|
||||
|
||||
type = "metric alert"
|
||||
notify_no_data = false
|
||||
notify_audit = false
|
||||
timeout_h = 0
|
||||
include_tags = true
|
||||
locked = false
|
||||
require_full_window = true
|
||||
require_full_window = false
|
||||
new_host_delay = "${var.delay}"
|
||||
evaluation_delay = "${var.delay}"
|
||||
renotify_interval = 0
|
||||
@ -152,7 +167,7 @@ EOF
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "throttling_error_requests" {
|
||||
name = "[${var.environment}] Azure Storage {{value}}% of throttling error requests on {{name}}"
|
||||
name = "[${var.environment}] Azure Storage too many throttling errors {{comparator}} {{#is_alert}}{{threshold}}%{{/is_alert}}{{#is_warning}}{{warn_threshold}}%{{/is_warning}} ({{value}}%)"
|
||||
message = "${var.message}"
|
||||
|
||||
query = <<EOF
|
||||
@ -163,15 +178,18 @@ EOF
|
||||
|
||||
thresholds {
|
||||
critical = "${var.throttling_error_requests_threshold_critical}"
|
||||
warning = "${var.throttling_error_requests_threshold_warning}"
|
||||
}
|
||||
|
||||
silenced = "${var.throttling_error_requests_silenced}"
|
||||
|
||||
type = "metric alert"
|
||||
notify_no_data = false
|
||||
notify_audit = false
|
||||
timeout_h = 0
|
||||
include_tags = true
|
||||
locked = false
|
||||
require_full_window = true
|
||||
require_full_window = false
|
||||
new_host_delay = "${var.delay}"
|
||||
evaluation_delay = "${var.delay}"
|
||||
renotify_interval = 0
|
||||
@ -181,7 +199,7 @@ EOF
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "server_other_error_requests" {
|
||||
name = "[${var.environment}] Azure Storage {{value}}% of server_other error requests on {{name}}"
|
||||
name = "[${var.environment}] Azure Storage too many server_other errors {{comparator}} {{#is_alert}}{{threshold}}%{{/is_alert}}{{#is_warning}}{{warn_threshold}}%{{/is_warning}} ({{value}}%)"
|
||||
message = "${var.message}"
|
||||
|
||||
query = <<EOF
|
||||
@ -192,15 +210,18 @@ EOF
|
||||
|
||||
thresholds {
|
||||
critical = "${var.server_other_error_requests_threshold_critical}"
|
||||
warning = "${var.server_other_error_requests_threshold_warning}"
|
||||
}
|
||||
|
||||
silenced = "${var.server_other_error_requests_silenced}"
|
||||
|
||||
type = "metric alert"
|
||||
notify_no_data = false
|
||||
notify_audit = false
|
||||
timeout_h = 0
|
||||
include_tags = true
|
||||
locked = false
|
||||
require_full_window = true
|
||||
require_full_window = false
|
||||
new_host_delay = "${var.delay}"
|
||||
evaluation_delay = "${var.delay}"
|
||||
renotify_interval = 0
|
||||
@ -210,7 +231,7 @@ EOF
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "client_other_error_requests" {
|
||||
name = "[${var.environment}] Azure Storage {{value}}% of client_other error requests on {{name}}"
|
||||
name = "[${var.environment}] Azure Storage too many client_other errors {{comparator}} {{#is_alert}}{{threshold}}%{{/is_alert}}{{#is_warning}}{{warn_threshold}}%{{/is_warning}} ({{value}}%)"
|
||||
message = "${var.message}"
|
||||
|
||||
query = <<EOF
|
||||
@ -221,15 +242,18 @@ EOF
|
||||
|
||||
thresholds {
|
||||
critical = "${var.client_other_error_requests_threshold_critical}"
|
||||
warning = "${var.client_other_error_requests_threshold_warning}"
|
||||
}
|
||||
|
||||
silenced = "${var.client_other_error_requests_silenced}"
|
||||
|
||||
type = "metric alert"
|
||||
notify_no_data = false
|
||||
notify_audit = false
|
||||
timeout_h = 0
|
||||
include_tags = true
|
||||
locked = false
|
||||
require_full_window = true
|
||||
require_full_window = false
|
||||
new_host_delay = "${var.delay}"
|
||||
evaluation_delay = "${var.delay}"
|
||||
renotify_interval = 0
|
||||
@ -239,7 +263,7 @@ EOF
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "authorization_error_requests" {
|
||||
name = "[${var.environment}] Azure Storage {{value}}% of authorization error requests on {{name}}"
|
||||
name = "[${var.environment}] Azure Storage too many authorization errors {{comparator}} {{#is_alert}}{{threshold}}%{{/is_alert}}{{#is_warning}}{{warn_threshold}}%{{/is_warning}} ({{value}}%)"
|
||||
message = "${var.message}"
|
||||
|
||||
query = <<EOF
|
||||
@ -250,15 +274,18 @@ EOF
|
||||
|
||||
thresholds {
|
||||
critical = "${var.authorization_error_requests_threshold_critical}"
|
||||
warning = "${var.authorization_error_requests_threshold_warning}"
|
||||
}
|
||||
|
||||
silenced = "${var.authorization_error_requests_silenced}"
|
||||
|
||||
type = "metric alert"
|
||||
notify_no_data = false
|
||||
notify_audit = false
|
||||
timeout_h = 0
|
||||
include_tags = true
|
||||
locked = false
|
||||
require_full_window = true
|
||||
require_full_window = false
|
||||
new_host_delay = "${var.delay}"
|
||||
evaluation_delay = "${var.delay}"
|
||||
renotify_interval = 0
|
||||
|
||||
@ -19,17 +19,22 @@ Inputs
|
||||
|
||||
| Name | Description | Type | Default | Required |
|
||||
|------|-------------|:----:|:-----:|:-----:|
|
||||
| conversion_errors_silenced | Groups to mute for Stream Analytics conversion errors monitor | map | `<map>` | no |
|
||||
| conversion_errors_threshold_critical | Conversion errors limit (critical threshold) | string | `10` | no |
|
||||
| conversion_errors_threshold_warning | Conversion errors limit (warning threshold) | string | `0` | no |
|
||||
| delay | Delay in seconds for the metric evaluation | string | `600` | no |
|
||||
| environment | Architecture environment | string | - | yes |
|
||||
| failed_function_requests_silenced | Groups to mute for Stream Analytics failed requests monitor | map | `<map>` | no |
|
||||
| failed_function_requests_threshold_critical | Failed Function Request rate limit (critical threshold) | string | `10` | no |
|
||||
| failed_function_requests_threshold_warning | Failed Function Request rate limit (warning threshold) | string | `0` | no |
|
||||
| filter_tags_custom | Tags used for custom filtering when filter_tags_use_defaults is false | string | `*` | no |
|
||||
| filter_tags_use_defaults | Use default filter tags convention | string | `true` | no |
|
||||
| function_requests_threshold_warning | Failed Function Request rate limit (warning threshold) | string | `0` | no |
|
||||
| message | Message sent when a Redis monitor is triggered | string | - | yes |
|
||||
| runtime_errors_silenced | Groups to mute for Stream Analytics runtime errors monitor | map | `<map>` | no |
|
||||
| runtime_errors_threshold_critical | Runtime errors limit (critical threshold) | string | `10` | no |
|
||||
| runtime_errors_threshold_warning | Runtime errors limit (warning threshold) | string | `0` | no |
|
||||
| status_silenced | Groups to mute for Stream Analytics status monitor | map | `<map>` | no |
|
||||
| su_utilization_silenced | Groups to mute for Stream Analytics utilization monitor | map | `<map>` | no |
|
||||
| su_utilization_threshold_critical | Streaming Unit utilization rate limit (critical threshold) | string | `80` | no |
|
||||
| su_utilization_threshold_warning | Streaming Unit utilization rate limit (warning threshold) | string | `60` | no |
|
||||
|
||||
|
||||
@ -25,6 +25,18 @@ variable "filter_tags_custom" {
|
||||
}
|
||||
|
||||
# Azure Stream Analytics specific
|
||||
variable "status_silenced" {
|
||||
description = "Groups to mute for Stream Analytics status monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "su_utilization_silenced" {
|
||||
description = "Groups to mute for Stream Analytics utilization monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "su_utilization_threshold_warning" {
|
||||
description = "Streaming Unit utilization rate limit (warning threshold)"
|
||||
default = 60
|
||||
@ -35,7 +47,13 @@ variable "su_utilization_threshold_critical" {
|
||||
default = 80
|
||||
}
|
||||
|
||||
variable "function_requests_threshold_warning" {
|
||||
variable "failed_function_requests_silenced" {
|
||||
description = "Groups to mute for Stream Analytics failed requests monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "failed_function_requests_threshold_warning" {
|
||||
description = "Failed Function Request rate limit (warning threshold)"
|
||||
default = 0
|
||||
}
|
||||
@ -45,6 +63,12 @@ variable "failed_function_requests_threshold_critical" {
|
||||
default = 10
|
||||
}
|
||||
|
||||
variable "conversion_errors_silenced" {
|
||||
description = "Groups to mute for Stream Analytics conversion errors monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "conversion_errors_threshold_warning" {
|
||||
description = "Conversion errors limit (warning threshold)"
|
||||
default = 0
|
||||
@ -55,6 +79,12 @@ variable "conversion_errors_threshold_critical" {
|
||||
default = 10
|
||||
}
|
||||
|
||||
variable "runtime_errors_silenced" {
|
||||
description = "Groups to mute for Stream Analytics runtime errors monitor"
|
||||
type = "map"
|
||||
default = {}
|
||||
}
|
||||
|
||||
variable "runtime_errors_threshold_warning" {
|
||||
description = "Runtime errors limit (warning threshold)"
|
||||
default = 0
|
||||
|
||||
@ -7,7 +7,7 @@ data "template_file" "filter" {
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "status" {
|
||||
name = "[${var.environment}] Stream Analytics Status is not ok on {{name}}"
|
||||
name = "[${var.environment}] Stream Analytics is down"
|
||||
message = "${var.message}"
|
||||
|
||||
query = <<EOF
|
||||
@ -16,6 +16,8 @@ resource "datadog_monitor" "status" {
|
||||
|
||||
type = "metric alert"
|
||||
|
||||
silenced = "${var.status_silenced}"
|
||||
|
||||
notify_no_data = true
|
||||
evaluation_delay = "${var.delay}"
|
||||
renotify_interval = 0
|
||||
@ -23,7 +25,7 @@ resource "datadog_monitor" "status" {
|
||||
timeout_h = 0
|
||||
include_tags = true
|
||||
locked = false
|
||||
require_full_window = true
|
||||
require_full_window = false
|
||||
new_host_delay = "${var.delay}"
|
||||
no_data_timeframe = 20
|
||||
|
||||
@ -31,7 +33,7 @@ resource "datadog_monitor" "status" {
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "su_utilization" {
|
||||
name = "[${var.environment}] Stream Analytics streaming Units utilization at more than ${var.su_utilization_threshold_critical}% on {{name}}"
|
||||
name = "[${var.environment}] Stream Analytics streaming units utilization too high {{comparator}} {{#is_alert}}{{threshold}}{{/is_alert}}{{#is_warning}}{{warn_threshold}}{{/is_warning}} ({{value}})"
|
||||
message = "${var.message}"
|
||||
|
||||
query = <<EOF
|
||||
@ -49,7 +51,7 @@ resource "datadog_monitor" "su_utilization" {
|
||||
timeout_h = 0
|
||||
include_tags = true
|
||||
locked = false
|
||||
require_full_window = true
|
||||
require_full_window = false
|
||||
new_host_delay = "${var.delay}"
|
||||
no_data_timeframe = 20
|
||||
|
||||
@ -58,11 +60,13 @@ resource "datadog_monitor" "su_utilization" {
|
||||
critical = "${var.su_utilization_threshold_critical}"
|
||||
}
|
||||
|
||||
silenced = "${var.su_utilization_silenced}"
|
||||
|
||||
tags = ["env:${var.environment}", "resource:streamanalytics", "team:azure", "provider:azure"]
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "failed_function_requests" {
|
||||
name = "[${var.environment}] Stream Analytics more than ${var.failed_function_requests_threshold_critical} failed function requests on {{name}}"
|
||||
name = "[${var.environment}] Stream Analytics too many failed requests {{comparator}} {{#is_alert}}{{threshold}}%{{/is_alert}}{{#is_warning}}{{warn_threshold}}%{{/is_warning}} ({{value}}%)"
|
||||
message = "${var.message}"
|
||||
|
||||
query = <<EOF
|
||||
@ -81,20 +85,22 @@ resource "datadog_monitor" "failed_function_requests" {
|
||||
timeout_h = 1
|
||||
include_tags = true
|
||||
locked = false
|
||||
require_full_window = true
|
||||
require_full_window = false
|
||||
new_host_delay = "${var.delay}"
|
||||
no_data_timeframe = 20
|
||||
|
||||
thresholds {
|
||||
warning = "${var.function_requests_threshold_warning}"
|
||||
warning = "${var.failed_function_requests_threshold_warning}"
|
||||
critical = "${var.failed_function_requests_threshold_critical}"
|
||||
}
|
||||
|
||||
silenced = "${var.failed_function_requests_silenced}"
|
||||
|
||||
tags = ["env:${var.environment}", "resource:streamanalytics", "team:azure", "provider:azure"]
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "conversion_errors" {
|
||||
name = "[${var.environment}] Stream Analytics more than ${var.conversion_errors_threshold_critical} conversion errors on {{name}}"
|
||||
name = "[${var.environment}] Stream Analytics too many conversion errors {{comparator}} {{#is_alert}}{{threshold}}%{{/is_alert}}{{#is_warning}}{{warn_threshold}}%{{/is_warning}} ({{value}}%)"
|
||||
message = "${var.message}"
|
||||
|
||||
query = <<EOF
|
||||
@ -112,7 +118,7 @@ resource "datadog_monitor" "conversion_errors" {
|
||||
timeout_h = 1
|
||||
include_tags = true
|
||||
locked = false
|
||||
require_full_window = true
|
||||
require_full_window = false
|
||||
new_host_delay = "${var.delay}"
|
||||
no_data_timeframe = 20
|
||||
|
||||
@ -121,11 +127,13 @@ resource "datadog_monitor" "conversion_errors" {
|
||||
critical = "${var.conversion_errors_threshold_critical}"
|
||||
}
|
||||
|
||||
silenced = "${var.conversion_errors_silenced}"
|
||||
|
||||
tags = ["env:${var.environment}", "resource:streamanalytics", "team:azure", "provider:azure"]
|
||||
}
|
||||
|
||||
resource "datadog_monitor" "runtime_errors" {
|
||||
name = "[${var.environment}] Stream Analytics more than ${var.runtime_errors_threshold_critical} runtime errors on {{name}}"
|
||||
name = "[${var.environment}] Stream Analytics too many runtime errors {{comparator}} {{#is_alert}}{{threshold}}{{/is_alert}}{{#is_warning}}{{warn_threshold}}{{/is_warning}} ({{value}})"
|
||||
message = "${var.message}"
|
||||
|
||||
query = <<EOF
|
||||
@ -143,7 +151,7 @@ resource "datadog_monitor" "runtime_errors" {
|
||||
timeout_h = 1
|
||||
include_tags = true
|
||||
locked = false
|
||||
require_full_window = true
|
||||
require_full_window = false
|
||||
new_host_delay = "${var.delay}"
|
||||
no_data_timeframe = 20
|
||||
|
||||
@ -152,5 +160,7 @@ resource "datadog_monitor" "runtime_errors" {
|
||||
critical = "${var.runtime_errors_threshold_critical}"
|
||||
}
|
||||
|
||||
silenced = "${var.runtime_errors_silenced}"
|
||||
|
||||
tags = ["env:${var.environment}", "resource:streamanalytics", "team:azure", "provider:azure"]
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user