Merged in TER-209-add-bitbucket-pipelines-to-suppo (pull request #40)

TER-209 add bitbucket pipelines to suppo

Approved-by: Adrien Bréfort <adrien.brefort@fr.clara.net>
This commit is contained in:
Adrien Bréfort 2018-01-23 14:10:03 +00:00
commit 37d83aa0f2
23 changed files with 231 additions and 202 deletions

13
.gitignore vendored
View File

@ -1,10 +1,3 @@
# Ignore all volatile files .terraform
**/.terraform/modules main.tf
**/.terraform/plugins terraform.tfvars
**/terraform.tfstate*.backup
# Ignore all credentials files
**/terraform.tfvars
# Ignore all but root state files
**/terraform.tfstate

15
bitbucket-pipelines.yml Normal file
View File

@ -0,0 +1,15 @@
# CI configuration for Bitbucket Pipelines.
# Every step runs inside the Terraform CLI image pinned below (0.10.8).
image: hashicorp/terraform:0.10.8

pipelines:
  default:
    # Step 1: formatting check. `-write=false` leaves the files untouched and
    # `-diff` prints the formatting differences that were found.
    - step:
        name: Format
        script:
          - terraform fmt -write=false -diff -check

    # Step 2: validation. The CI-specific `main.tf.ci` and
    # `terraform.tfvars.ci` are renamed into place first, so that
    # `terraform init` and `terraform validate` run against them.
    - step:
        name: Validate
        script:
          - mv main.tf.ci main.tf
          - mv terraform.tfvars.ci terraform.tfvars
          - terraform init
          - terraform validate

View File

@ -11,7 +11,8 @@ resource "datadog_monitor" "es_cluster_status" {
name = "[${var.environment}] ElasticSearch cluster status is not green" name = "[${var.environment}] ElasticSearch cluster status is not green"
message = "${var.message}" message = "${var.message}"
type = "query alert" type = "query alert"
query = <<EOF query = <<EOF
max(last_30m): ( max(last_30m): (
avg:aws.es.cluster_statusred{${data.template_file.filter.rendered}} by {region,name} * 2 + avg:aws.es.cluster_statusred{${data.template_file.filter.rendered}} by {region,name} * 2 +
@ -44,7 +45,8 @@ resource "datadog_monitor" "es_free_space_low" {
name = "[${var.environment}] ElasticSearch cluster free storage space < ${var.diskspace_threshold_critical}%" name = "[${var.environment}] ElasticSearch cluster free storage space < ${var.diskspace_threshold_critical}%"
message = "${var.message}" message = "${var.message}"
type = "query alert" type = "query alert"
query = <<EOF query = <<EOF
avg(last_15m): ( avg(last_15m): (
avg:aws.es.free_storage_space{${data.template_file.filter.rendered}} by {region,name} / (${var.es_cluster_volume_size}*1000) * 100 avg:aws.es.free_storage_space{${data.template_file.filter.rendered}} by {region,name} / (${var.es_cluster_volume_size}*1000) * 100
@ -75,7 +77,8 @@ resource "datadog_monitor" "es_cpu_90_15min" {
name = "[${var.environment}] ElasticSearch cluster CPU high > ${var.cpu_threshold_critical}%" name = "[${var.environment}] ElasticSearch cluster CPU high > ${var.cpu_threshold_critical}%"
message = "${var.message}" message = "${var.message}"
type = "query alert" type = "query alert"
query = <<EOF query = <<EOF
avg(last_15m): ( avg(last_15m): (
avg:aws.es.cpuutilization{${data.template_file.filter.rendered}} by {region,name} avg:aws.es.cpuutilization{${data.template_file.filter.rendered}} by {region,name}

View File

@ -1,7 +1,7 @@
# Global Terraform # Global Terraform
variable "environment" { variable "environment" {
description = "Architecture environment" description = "Architecture environment"
type = "string" type = "string"
} }
# Global DataDog # Global DataDog
@ -27,20 +27,20 @@ variable "filter_tags_custom" {
# Azure API Management specific # Azure API Management specific
variable "failed_requests_threshold_critical" { variable "failed_requests_threshold_critical" {
description = "Maximum acceptable percent of failed requests" description = "Maximum acceptable percent of failed requests"
default = 5 default = 5
} }
variable "other_requests_threshold_critical" { variable "other_requests_threshold_critical" {
description = "Maximum acceptable percent of other requests" description = "Maximum acceptable percent of other requests"
default = 5 default = 5
} }
variable "unauthorized_requests_threshold_critical" { variable "unauthorized_requests_threshold_critical" {
description = "Maximum acceptable percent of unauthorized requests" description = "Maximum acceptable percent of unauthorized requests"
default = 5 default = 5
} }
variable "successful_requests_threshold_critical" { variable "successful_requests_threshold_critical" {
description = "Minimum acceptable percent of successful requests" description = "Minimum acceptable percent of successful requests"
default = 90 default = 90
} }

View File

@ -15,7 +15,8 @@ resource "datadog_monitor" "apimgt_status" {
query = <<EOF query = <<EOF
avg(last_5m):avg:azure.apimanagement_service.status{${data.template_file.filter.rendered}} by {resource_group,region,name} < 1 avg(last_5m):avg:azure.apimanagement_service.status{${data.template_file.filter.rendered}} by {resource_group,region,name} < 1
EOF EOF
type = "metric alert"
type = "metric alert"
thresholds { thresholds {
critical = 1 critical = 1
@ -47,7 +48,7 @@ resource "datadog_monitor" "apimgt_failed_requests" {
EOF EOF
thresholds { thresholds {
critical = "${var.failed_requests_threshold_critical}" critical = "${var.failed_requests_threshold_critical}"
} }
type = "metric alert" type = "metric alert"
@ -77,7 +78,7 @@ resource "datadog_monitor" "apimgt_other_requests" {
EOF EOF
thresholds { thresholds {
critical = "${var.other_requests_threshold_critical}" critical = "${var.other_requests_threshold_critical}"
} }
type = "metric alert" type = "metric alert"
@ -107,7 +108,7 @@ resource "datadog_monitor" "apimgt_unauthorized_requests" {
EOF EOF
thresholds { thresholds {
critical = "${var.unauthorized_requests_threshold_critical}" critical = "${var.unauthorized_requests_threshold_critical}"
} }
type = "metric alert" type = "metric alert"
@ -137,7 +138,7 @@ resource "datadog_monitor" "apimgt_successful_requests" {
EOF EOF
thresholds { thresholds {
critical = "${var.successful_requests_threshold_critical}" critical = "${var.successful_requests_threshold_critical}"
} }
type = "metric alert" type = "metric alert"

View File

@ -26,7 +26,7 @@ resource "datadog_monitor" "appservices_response_time" {
critical = "${var.response_time_threshold_critical}" critical = "${var.response_time_threshold_critical}"
} }
notify_no_data = true # Will notify when no data is received notify_no_data = true # Will notify when no data is received
renotify_interval = 0 renotify_interval = 0
require_full_window = true require_full_window = true
timeout_h = 0 timeout_h = 0
@ -55,7 +55,7 @@ resource "datadog_monitor" "appservices_memory_usage_count" {
critical = "${var.memory_usage_threshold_critical}" critical = "${var.memory_usage_threshold_critical}"
} }
notify_no_data = true # Will notify when no data is received notify_no_data = true # Will notify when no data is received
renotify_interval = 0 renotify_interval = 0
require_full_window = true require_full_window = true
timeout_h = 0 timeout_h = 0
@ -146,7 +146,7 @@ resource "datadog_monitor" "appservices_http_success_status_rate" {
critical = "${var.http_successful_requests_threshold_critical}" critical = "${var.http_successful_requests_threshold_critical}"
} }
notify_no_data = false # Will notify when no data is received notify_no_data = false # Will notify when no data is received
renotify_interval = 0 renotify_interval = 0
require_full_window = true require_full_window = true
timeout_h = 1 timeout_h = 1

View File

@ -1,7 +1,7 @@
# Global Terraform # Global Terraform
variable "environment" { variable "environment" {
description = "Architecture environment" description = "Architecture environment"
type = "string" type = "string"
} }
# Global DataDog # Global DataDog
@ -11,7 +11,7 @@ variable "message" {
variable "delay" { variable "delay" {
description = "Delay in seconds for the metric evaluation" description = "Delay in seconds for the metric evaluation"
default = 600 default = 600
} }
variable "filter_tags_use_defaults" { variable "filter_tags_use_defaults" {
@ -26,20 +26,20 @@ variable "filter_tags_custom" {
variable "failed_requests_rate_thresold_critical" { variable "failed_requests_rate_thresold_critical" {
description = "Failed requests ratio (percentage) to trigger the critical alert" description = "Failed requests ratio (percentage) to trigger the critical alert"
default = 3 default = 3
} }
variable "failed_requests_rate_thresold_warning" { variable "failed_requests_rate_thresold_warning" {
description = "Failed requests ratio (percentage) to trigger a warning alert" description = "Failed requests ratio (percentage) to trigger a warning alert"
default = 1 default = 1
} }
variable "errors_rate_thresold_critical" { variable "errors_rate_thresold_critical" {
description = "Errors ratio (percentage) to trigger the critical alert" description = "Errors ratio (percentage) to trigger the critical alert"
default = 3 default = 3
} }
variable "errors_rate_thresold_warning" { variable "errors_rate_thresold_warning" {
description = "Errors ratio (percentage) to trigger a warning alert" description = "Errors ratio (percentage) to trigger a warning alert"
default = 1 default = 1
} }

View File

@ -13,7 +13,8 @@ resource "datadog_monitor" "eventhub_status" {
query = <<EOF query = <<EOF
avg(last_5m): avg:azure.eventhub_namespaces.status{${data.template_file.filter.rendered}} by {resource_group,region,name} != 1 avg(last_5m): avg:azure.eventhub_namespaces.status{${data.template_file.filter.rendered}} by {resource_group,region,name} != 1
EOF EOF
type = "metric alert"
type = "metric alert"
notify_no_data = true notify_no_data = true
evaluation_delay = "${var.delay}" evaluation_delay = "${var.delay}"
@ -41,7 +42,8 @@ resource "datadog_monitor" "eventhub_failed_requests" {
avg:azure.eventhub_namespaces.failed_requests{${data.template_file.filter.rendered}} by {resource_group,region,name}.as_count() avg:azure.eventhub_namespaces.failed_requests{${data.template_file.filter.rendered}} by {resource_group,region,name}.as_count()
) * 100 > ${var.failed_requests_rate_thresold_critical} ) * 100 > ${var.failed_requests_rate_thresold_critical}
EOF EOF
type = "metric alert"
type = "metric alert"
thresholds { thresholds {
critical = "${var.failed_requests_rate_thresold_critical}" critical = "${var.failed_requests_rate_thresold_critical}"
@ -78,7 +80,8 @@ resource "datadog_monitor" "eventhub_errors" {
avg:azure.eventhub_namespaces.other_errors{${data.template_file.filter.rendered}} by {resource_group,region,name}.as_count() avg:azure.eventhub_namespaces.other_errors{${data.template_file.filter.rendered}} by {resource_group,region,name}.as_count()
) * 100 > ${var.errors_rate_thresold_critical} ) * 100 > ${var.errors_rate_thresold_critical}
EOF EOF
type = "metric alert"
type = "metric alert"
thresholds { thresholds {
critical = "${var.errors_rate_thresold_critical}" critical = "${var.errors_rate_thresold_critical}"

View File

@ -16,7 +16,7 @@ variable "message" {
variable "filter_tags" { variable "filter_tags" {
description = "Tags used for filtering" description = "Tags used for filtering"
default = "*" default = "*"
} }
# Azure IOT hubs specific # Azure IOT hubs specific

View File

@ -1,7 +1,7 @@
# Global Terraform # Global Terraform
variable "environment" { variable "environment" {
description = "Architecture environment" description = "Architecture environment"
type = "string" type = "string"
} }
# Global DataDog # Global DataDog

View File

@ -1,7 +1,7 @@
# Global Terraform # Global Terraform
variable "environment" { variable "environment" {
description = "Architecture environment" description = "Architecture environment"
type = "string" type = "string"
} }
# Global DataDog # Global DataDog
@ -27,46 +27,45 @@ variable "filter_tags_custom" {
# Azure Storage specific # Azure Storage specific
variable "availability_threshold_critical" { variable "availability_threshold_critical" {
description = "Minimum acceptable percent of availability for a storage" description = "Minimum acceptable percent of availability for a storage"
default = 90 default = 90
} }
variable "successful_requests_threshold_critical" { variable "successful_requests_threshold_critical" {
description = "Minimum acceptable percent of successful requests for a storage" description = "Minimum acceptable percent of successful requests for a storage"
default = 90 default = 90
} }
variable "latency_threshold_critical" { variable "latency_threshold_critical" {
description = "Maximum acceptable end to end latency (ms) for a storage" description = "Maximum acceptable end to end latency (ms) for a storage"
default = 1000 default = 1000
} }
variable "timeout_error_requests_threshold_critical" { variable "timeout_error_requests_threshold_critical" {
description = "Maximum acceptable percent of timeout error requests for a storage" description = "Maximum acceptable percent of timeout error requests for a storage"
default = 5 default = 5
} }
variable "network_error_requests_threshold_critical" { variable "network_error_requests_threshold_critical" {
description = "Maximum acceptable percent of network error requests for a storage" description = "Maximum acceptable percent of network error requests for a storage"
default = 5 default = 5
} }
variable "throttling_error_requests_threshold_critical" { variable "throttling_error_requests_threshold_critical" {
description = "Maximum acceptable percent of throttling error requests for a storage" description = "Maximum acceptable percent of throttling error requests for a storage"
default = 10 default = 10
} }
variable "server_other_error_requests_threshold_critical" { variable "server_other_error_requests_threshold_critical" {
description = "Maximum acceptable percent of server other error requests for a storage" description = "Maximum acceptable percent of server other error requests for a storage"
default = 10 default = 10
} }
variable "client_other_error_requests_threshold_critical" { variable "client_other_error_requests_threshold_critical" {
description = "Maximum acceptable percent of client other error requests for a storage" description = "Maximum acceptable percent of client other error requests for a storage"
default = 15 default = 15
} }
variable "authorization_error_requests_threshold_critical" { variable "authorization_error_requests_threshold_critical" {
description = "Maximum acceptable percent of authorization error requests for a storage" description = "Maximum acceptable percent of authorization error requests for a storage"
default = 15 default = 15
} }

View File

@ -17,7 +17,7 @@ resource "datadog_monitor" "availability" {
EOF EOF
thresholds { thresholds {
critical = "${var.availability_threshold_critical}" critical = "${var.availability_threshold_critical}"
} }
type = "metric alert" type = "metric alert"
@ -46,7 +46,7 @@ resource "datadog_monitor" "successful_requests" {
EOF EOF
thresholds { thresholds {
critical = "${var.successful_requests_threshold_critical}" critical = "${var.successful_requests_threshold_critical}"
} }
type = "metric alert" type = "metric alert"
@ -75,7 +75,7 @@ resource "datadog_monitor" "latency" {
EOF EOF
thresholds { thresholds {
critical = "${var.latency_threshold_critical}" critical = "${var.latency_threshold_critical}"
} }
type = "metric alert" type = "metric alert"
@ -104,7 +104,7 @@ resource "datadog_monitor" "timeout_error_requests" {
EOF EOF
thresholds { thresholds {
critical = "${var.timeout_error_requests_threshold_critical}" critical = "${var.timeout_error_requests_threshold_critical}"
} }
type = "metric alert" type = "metric alert"
@ -122,7 +122,6 @@ EOF
tags = ["env:${var.environment}", "resource:storage", "team:azure", "provider:azure"] tags = ["env:${var.environment}", "resource:storage", "team:azure", "provider:azure"]
} }
resource "datadog_monitor" "network_error_requests" { resource "datadog_monitor" "network_error_requests" {
name = "[${var.environment}] Azure Storage {{value}}% of network error requests on {{name}}" name = "[${var.environment}] Azure Storage {{value}}% of network error requests on {{name}}"
message = "${var.message}" message = "${var.message}"
@ -134,7 +133,7 @@ resource "datadog_monitor" "network_error_requests" {
EOF EOF
thresholds { thresholds {
critical = "${var.network_error_requests_threshold_critical}" critical = "${var.network_error_requests_threshold_critical}"
} }
type = "metric alert" type = "metric alert"
@ -152,7 +151,6 @@ EOF
tags = ["env:${var.environment}", "resource:storage", "team:azure", "provider:azure"] tags = ["env:${var.environment}", "resource:storage", "team:azure", "provider:azure"]
} }
resource "datadog_monitor" "throttling_error_requests" { resource "datadog_monitor" "throttling_error_requests" {
name = "[${var.environment}] Azure Storage {{value}}% of throttling error requests on {{name}}" name = "[${var.environment}] Azure Storage {{value}}% of throttling error requests on {{name}}"
message = "${var.message}" message = "${var.message}"
@ -164,7 +162,7 @@ resource "datadog_monitor" "throttling_error_requests" {
EOF EOF
thresholds { thresholds {
critical = "${var.throttling_error_requests_threshold_critical}" critical = "${var.throttling_error_requests_threshold_critical}"
} }
type = "metric alert" type = "metric alert"
@ -182,7 +180,6 @@ EOF
tags = ["env:${var.environment}", "resource:storage", "team:azure", "provider:azure"] tags = ["env:${var.environment}", "resource:storage", "team:azure", "provider:azure"]
} }
resource "datadog_monitor" "server_other_error_requests" { resource "datadog_monitor" "server_other_error_requests" {
name = "[${var.environment}] Azure Storage {{value}}% of server_other error requests on {{name}}" name = "[${var.environment}] Azure Storage {{value}}% of server_other error requests on {{name}}"
message = "${var.message}" message = "${var.message}"
@ -194,7 +191,7 @@ resource "datadog_monitor" "server_other_error_requests" {
EOF EOF
thresholds { thresholds {
critical = "${var.server_other_error_requests_threshold_critical}" critical = "${var.server_other_error_requests_threshold_critical}"
} }
type = "metric alert" type = "metric alert"
@ -212,7 +209,6 @@ EOF
tags = ["env:${var.environment}", "resource:storage", "team:azure", "provider:azure"] tags = ["env:${var.environment}", "resource:storage", "team:azure", "provider:azure"]
} }
resource "datadog_monitor" "client_other_error_requests" { resource "datadog_monitor" "client_other_error_requests" {
name = "[${var.environment}] Azure Storage {{value}}% of client_other error requests on {{name}}" name = "[${var.environment}] Azure Storage {{value}}% of client_other error requests on {{name}}"
message = "${var.message}" message = "${var.message}"
@ -224,7 +220,7 @@ resource "datadog_monitor" "client_other_error_requests" {
EOF EOF
thresholds { thresholds {
critical = "${var.client_other_error_requests_threshold_critical}" critical = "${var.client_other_error_requests_threshold_critical}"
} }
type = "metric alert" type = "metric alert"
@ -242,7 +238,6 @@ EOF
tags = ["env:${var.environment}", "resource:storage", "team:azure", "provider:azure"] tags = ["env:${var.environment}", "resource:storage", "team:azure", "provider:azure"]
} }
resource "datadog_monitor" "authorization_error_requests" { resource "datadog_monitor" "authorization_error_requests" {
name = "[${var.environment}] Azure Storage {{value}}% of authorization error requests on {{name}}" name = "[${var.environment}] Azure Storage {{value}}% of authorization error requests on {{name}}"
message = "${var.message}" message = "${var.message}"
@ -254,7 +249,7 @@ resource "datadog_monitor" "authorization_error_requests" {
EOF EOF
thresholds { thresholds {
critical = "${var.authorization_error_requests_threshold_critical}" critical = "${var.authorization_error_requests_threshold_critical}"
} }
type = "metric alert" type = "metric alert"

View File

@ -27,40 +27,40 @@ variable "filter_tags_custom" {
# Azure Stream Analytics specific # Azure Stream Analytics specific
variable "su_utilization_threshold_warning" { variable "su_utilization_threshold_warning" {
description = "Streaming Unit utilization rate limit (warning threshold)" description = "Streaming Unit utilization rate limit (warning threshold)"
default = 60 default = 60
} }
variable "su_utilization_threshold_critical" { variable "su_utilization_threshold_critical" {
description = "Streaming Unit utilization rate limit (critical threshold)" description = "Streaming Unit utilization rate limit (critical threshold)"
default = 80 default = 80
} }
variable "function_requests_threshold_warning" { variable "function_requests_threshold_warning" {
description = "Failed Function Request rate limit (warning threshold)" description = "Failed Function Request rate limit (warning threshold)"
default = 0 default = 0
} }
variable "failed_function_requests_threshold_critical" { variable "failed_function_requests_threshold_critical" {
description = "Failed Function Request rate limit (critical threshold)" description = "Failed Function Request rate limit (critical threshold)"
default = 10 default = 10
} }
variable "conversion_errors_threshold_warning" { variable "conversion_errors_threshold_warning" {
description = "Conversion errors limit (warning threshold)" description = "Conversion errors limit (warning threshold)"
default = 0 default = 0
} }
variable "conversion_errors_threshold_critical" { variable "conversion_errors_threshold_critical" {
description = "Conversion errors limit (critical threshold)" description = "Conversion errors limit (critical threshold)"
default = 10 default = 10
} }
variable "runtime_errors_threshold_warning" { variable "runtime_errors_threshold_warning" {
description = "Runtime errors limit (warning threshold)" description = "Runtime errors limit (warning threshold)"
default = 0 default = 0
} }
variable "runtime_errors_threshold_critical" { variable "runtime_errors_threshold_critical" {
description = "Runtime errors limit (critical threshold)" description = "Runtime errors limit (critical threshold)"
default = 10 default = 10
} }

View File

@ -13,7 +13,8 @@ resource "datadog_monitor" "status" {
query = <<EOF query = <<EOF
avg(last_5m):avg:azure.streamanalytics_streamingjobs.status{${data.template_file.filter.rendered}} by {resource_group,region,name} < 1 avg(last_5m):avg:azure.streamanalytics_streamingjobs.status{${data.template_file.filter.rendered}} by {resource_group,region,name} < 1
EOF EOF
type = "metric alert"
type = "metric alert"
notify_no_data = true notify_no_data = true
evaluation_delay = "${var.delay}" evaluation_delay = "${var.delay}"
@ -38,7 +39,8 @@ resource "datadog_monitor" "su_utilization" {
avg:azure.streamanalytics_streamingjobs.resource_utilization{${data.template_file.filter.rendered}} by {resource_group,region,name} avg:azure.streamanalytics_streamingjobs.resource_utilization{${data.template_file.filter.rendered}} by {resource_group,region,name}
) > ${var.su_utilization_threshold_critical} ) > ${var.su_utilization_threshold_critical}
EOF EOF
type = "metric alert"
type = "metric alert"
notify_no_data = false notify_no_data = false
evaluation_delay = "${var.delay}" evaluation_delay = "${var.delay}"
@ -50,6 +52,7 @@ resource "datadog_monitor" "su_utilization" {
require_full_window = true require_full_window = true
new_host_delay = "${var.delay}" new_host_delay = "${var.delay}"
no_data_timeframe = 20 no_data_timeframe = 20
thresholds { thresholds {
warning = "${var.su_utilization_threshold_warning}" warning = "${var.su_utilization_threshold_warning}"
critical = "${var.su_utilization_threshold_critical}" critical = "${var.su_utilization_threshold_critical}"
@ -68,7 +71,8 @@ resource "datadog_monitor" "failed_function_requests" {
avg:azure.streamanalytics_streamingjobs.aml_callout_requests{${data.template_file.filter.rendered}} by {resource_group,region,name}.as_count() avg:azure.streamanalytics_streamingjobs.aml_callout_requests{${data.template_file.filter.rendered}} by {resource_group,region,name}.as_count()
) * 100 > ${var.failed_function_requests_threshold_critical} ) * 100 > ${var.failed_function_requests_threshold_critical}
EOF EOF
type = "metric alert"
type = "metric alert"
notify_no_data = false notify_no_data = false
evaluation_delay = "${var.delay}" evaluation_delay = "${var.delay}"
@ -80,6 +84,7 @@ resource "datadog_monitor" "failed_function_requests" {
require_full_window = true require_full_window = true
new_host_delay = "${var.delay}" new_host_delay = "${var.delay}"
no_data_timeframe = 20 no_data_timeframe = 20
thresholds { thresholds {
warning = "${var.function_requests_threshold_warning}" warning = "${var.function_requests_threshold_warning}"
critical = "${var.failed_function_requests_threshold_critical}" critical = "${var.failed_function_requests_threshold_critical}"
@ -97,7 +102,8 @@ resource "datadog_monitor" "conversion_errors" {
avg:azure.streamanalytics_streamingjobs.conversion_errors{${data.template_file.filter.rendered}} by {resource_group,region,name} avg:azure.streamanalytics_streamingjobs.conversion_errors{${data.template_file.filter.rendered}} by {resource_group,region,name}
) > ${var.conversion_errors_threshold_critical} ) > ${var.conversion_errors_threshold_critical}
EOF EOF
type = "metric alert"
type = "metric alert"
notify_no_data = false notify_no_data = false
evaluation_delay = "${var.delay}" evaluation_delay = "${var.delay}"
@ -109,6 +115,7 @@ resource "datadog_monitor" "conversion_errors" {
require_full_window = true require_full_window = true
new_host_delay = "${var.delay}" new_host_delay = "${var.delay}"
no_data_timeframe = 20 no_data_timeframe = 20
thresholds { thresholds {
warning = "${var.conversion_errors_threshold_warning}" warning = "${var.conversion_errors_threshold_warning}"
critical = "${var.conversion_errors_threshold_critical}" critical = "${var.conversion_errors_threshold_critical}"
@ -126,7 +133,8 @@ resource "datadog_monitor" "runtime_errors" {
avg:azure.streamanalytics_streamingjobs.errors{${data.template_file.filter.rendered}} by {resource_group,region,name} avg:azure.streamanalytics_streamingjobs.errors{${data.template_file.filter.rendered}} by {resource_group,region,name}
) > ${var.runtime_errors_threshold_critical} ) > ${var.runtime_errors_threshold_critical}
EOF EOF
type = "metric alert"
type = "metric alert"
notify_no_data = false notify_no_data = false
evaluation_delay = "${var.delay}" evaluation_delay = "${var.delay}"
@ -138,6 +146,7 @@ resource "datadog_monitor" "runtime_errors" {
require_full_window = true require_full_window = true
new_host_delay = "${var.delay}" new_host_delay = "${var.delay}"
no_data_timeframe = 20 no_data_timeframe = 20
thresholds { thresholds {
warning = "${var.runtime_errors_threshold_warning}" warning = "${var.runtime_errors_threshold_warning}"
critical = "${var.runtime_errors_threshold_critical}" critical = "${var.runtime_errors_threshold_critical}"

View File

@ -1,8 +1,8 @@
resource "datadog_monitor" "cloud_sql_cpu_90" { resource "datadog_monitor" "cloud_sql_cpu_90" {
name = "Cloud SQL CPU high > 90%" name = "Cloud SQL CPU high > 90%"
message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}" message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}"
query = "avg(last_5m):avg:gcp.cloudsql.database.cpu.utilization{project_id:${var.project_id}} >= 90" query = "avg(last_5m):avg:gcp.cloudsql.database.cpu.utilization{project_id:${var.project_id}} >= 90"
type = "query alert" type = "query alert"
notify_no_data = false notify_no_data = false
renotify_interval = 60 renotify_interval = 60
@ -18,9 +18,9 @@ resource "datadog_monitor" "cloud_sql_cpu_90" {
} }
resource "datadog_monitor" "cloud_sql_disk_space" { resource "datadog_monitor" "cloud_sql_disk_space" {
name = "Cloud SQL free disk space < 10%" name = "Cloud SQL free disk space < 10%"
message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_HO} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.warning_HO}\n{{/is_warning_recovery}}" message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_HO} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.warning_HO}\n{{/is_warning_recovery}}"
query = "avg(last_5m):avg:gcp.cloudsql.database.disk.bytes_used{project_id:${var.project_id}} by {database_id} / avg:gcp.cloudsql.database.disk.quota{project_id:${var.project_id}} by {database_id} * 100 >= 90" query = "avg(last_5m):avg:gcp.cloudsql.database.disk.bytes_used{project_id:${var.project_id}} by {database_id} / avg:gcp.cloudsql.database.disk.quota{project_id:${var.project_id}} by {database_id} * 100 >= 90"
thresholds { thresholds {
warning = 70 warning = 70
@ -42,60 +42,58 @@ resource "datadog_monitor" "cloud_sql_disk_space" {
} }
resource "datadog_monitor" "cloud_sql_connection_80" { resource "datadog_monitor" "cloud_sql_connection_80" {
name = "Cloud SQL MySQL connection > 80% of max connections" name = "Cloud SQL MySQL connection > 80% of max connections"
message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}" message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}"
query = "avg(last_5m):avg:gcp.cloudsql.database.network.connections{*} > 3500" query = "avg(last_5m):avg:gcp.cloudsql.database.network.connections{*} > 3500"
type = "metric alert" type = "metric alert"
notify_no_data = false notify_no_data = false
renotify_interval = 60 renotify_interval = 60
notify_audit = false notify_audit = false
timeout_h = 0 timeout_h = 0
include_tags = true include_tags = true
locked = false locked = false
require_full_window = true require_full_window = true
new_host_delay = 300 new_host_delay = 300
notify_no_data = false notify_no_data = false
renotify_interval = 0 renotify_interval = 0
no_data_timeframe = 20 no_data_timeframe = 20
} }
resource "datadog_monitor" "cloud_sql_lag" { resource "datadog_monitor" "cloud_sql_lag" {
name = "Cloud SQL MySQL lag > 45min" name = "Cloud SQL MySQL lag > 45min"
message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}" message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}"
query = "min(last_10m):avg:gcp.cloudsql.database.mysql.replication.seconds_behind_master{*} by {database_id} > 2700" query = "min(last_10m):avg:gcp.cloudsql.database.mysql.replication.seconds_behind_master{*} by {database_id} > 2700"
type = "metric alert" type = "metric alert"
notify_no_data = false
renotify_interval = 60
notify_audit = false
timeout_h = 0
include_tags = true
locked = false
require_full_window = true
new_host_delay = 300
notify_no_data = false
renotify_interval = 0
no_data_timeframe = 20
notify_no_data = false
renotify_interval = 60
notify_audit = false
timeout_h = 0
include_tags = true
locked = false
require_full_window = true
new_host_delay = 300
notify_no_data = false
renotify_interval = 0
no_data_timeframe = 20
} }
resource "datadog_monitor" "cloud_sql_replication" { resource "datadog_monitor" "cloud_sql_replication" {
name = "Cloud SQL Failover not ready to replication" name = "Cloud SQL Failover not ready to replication"
message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}" message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}"
query = "max(last_5m):avg:gcp.cloudsql.database.mysql.replication.available_for_failover{*} <= 0" query = "max(last_5m):avg:gcp.cloudsql.database.mysql.replication.available_for_failover{*} <= 0"
type = "metric alert" type = "metric alert"
notify_no_data = false
renotify_interval = 60
notify_audit = false
timeout_h = 0
include_tags = true
locked = false
require_full_window = true
new_host_delay = 300
notify_no_data = false
renotify_interval = 0
no_data_timeframe = 20
notify_no_data = false
renotify_interval = 60
notify_audit = false
timeout_h = 0
include_tags = true
locked = false
require_full_window = true
new_host_delay = 300
notify_no_data = false
renotify_interval = 0
no_data_timeframe = 20
} }

View File

@ -1,7 +1,7 @@
resource "datadog_monitor" "kubernetes_cluster_cpu" { resource "datadog_monitor" "kubernetes_cluster_cpu" {
name = "Kubernetes cluster CPU High > 85%" name = "Kubernetes cluster CPU High > 85%"
message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_HO} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.warning_HO}\n{{/is_warning_recovery}}" message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_HO} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.warning_HO}\n{{/is_warning_recovery}}"
query = "avg(last_5m):avg:system.cpu.system{*} by {cluster-name} + avg:system.cpu.user{*} by {cluster-name} > 85" query = "avg(last_5m):avg:system.cpu.system{*} by {cluster-name} + avg:system.cpu.user{*} by {cluster-name} > 85"
thresholds { thresholds {
warning = 75 warning = 75
@ -25,7 +25,7 @@ resource "datadog_monitor" "kubernetes_cluster_cpu" {
resource "datadog_monitor" "kubernetes_kubelet_check" { resource "datadog_monitor" "kubernetes_kubelet_check" {
name = "Kubernetes kubelet check down" name = "Kubernetes kubelet check down"
message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_HO} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.warning_HO}\n{{/is_warning_recovery}}" message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_HO} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.warning_HO}\n{{/is_warning_recovery}}"
query = "\"kubernetes.kubelet.check\".over(\"goog-gke-node\").by(\"*\").last(1).pct_by_status()" query = "\"kubernetes.kubelet.check\".over(\"goog-gke-node\").by(\"*\").last(1).pct_by_status()"
thresholds { thresholds {
warning = 0 warning = 0
@ -48,8 +48,10 @@ resource "datadog_monitor" "kubernetes_kubelet_check" {
resource "datadog_monitor" "kubernetes_kubelet_ping" { resource "datadog_monitor" "kubernetes_kubelet_ping" {
name = "Kubernetes kubelet ping not ok" name = "Kubernetes kubelet ping not ok"
message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_HO} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.warning_HO}\n{{/is_warning_recovery}}"message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_HO} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.warning_HO}\n{{/is_warning_recovery}}" message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_HO} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.warning_HO}\n{{/is_warning_recovery}}"
query = "\"kubernetes.kubelet.check.ping\".over(\"goog-gke-node\").by(\"*\").last(1).pct_by_status()"
message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_HO} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.warning_HO}\n{{/is_warning_recovery}}"
query = "\"kubernetes.kubelet.check.ping\".over(\"goog-gke-node\").by(\"*\").last(1).pct_by_status()"
thresholds { thresholds {
warning = 0 warning = 0
@ -71,10 +73,10 @@ resource "datadog_monitor" "kubernetes_kubelet_ping" {
} }
resource "datadog_monitor" "kubernetes_pods_unavailable" { resource "datadog_monitor" "kubernetes_pods_unavailable" {
name = "Kubernetes pods unavailable" name = "Kubernetes pods unavailable"
message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_HO} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.warning_HO}\n{{/is_warning_recovery}}" message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_HO} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.warning_HO}\n{{/is_warning_recovery}}"
query = "max(last_5m):avg:kubernetes_state.deployment.replicas_desired{!namespace:cronetes} by {cluster-name,namespace,deployment} - avg:kubernetes_state.deployment.replicas_unavailable{!namespace:cronetes} by {cluster-name,namespace,deployment} + 1 < 1" query = "max(last_5m):avg:kubernetes_state.deployment.replicas_desired{!namespace:cronetes} by {cluster-name,namespace,deployment} - avg:kubernetes_state.deployment.replicas_unavailable{!namespace:cronetes} by {cluster-name,namespace,deployment} + 1 < 1"
type = "query alert" type = "query alert"
notify_no_data = false notify_no_data = false
renotify_interval = 60 renotify_interval = 60
@ -90,10 +92,10 @@ resource "datadog_monitor" "kubernetes_pods_unavailable" {
} }
resource "datadog_monitor" "kubernetes_node_status" { resource "datadog_monitor" "kubernetes_node_status" {
name = "Kubernetes node status" name = "Kubernetes node status"
message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_HO} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.warning_HO}\n{{/is_warning_recovery}}" message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_HO} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.warning_HO}\n{{/is_warning_recovery}}"
query = "max(last_5m):avg:kubernetes_state.node.status{!namespace:cronetes} by {cluster-name,namespace,deployment} <= 0" query = "max(last_5m):avg:kubernetes_state.node.status{!namespace:cronetes} by {cluster-name,namespace,deployment} <= 0"
type = "metric alert" type = "metric alert"
notify_no_data = false notify_no_data = false
renotify_interval = 60 renotify_interval = 60
@ -108,7 +110,7 @@ resource "datadog_monitor" "kubernetes_node_status" {
no_data_timeframe = 20 no_data_timeframe = 20
} }
type = "query alert" /* type = "query alert"
thresholds { thresholds {
# warning = 75 # warning = 75
@ -126,4 +128,5 @@ resource "datadog_monitor" "kubernetes_node_status" {
notify_no_data = false notify_no_data = false
renotify_interval = 0 renotify_interval = 0
no_data_timeframe = 20 no_data_timeframe = 20
} }*/

View File

@ -25,7 +25,7 @@ resource "datadog_monitor" "cpu_95_5min" {
query = "min(last_5m):avg:system.cpu.system{dd_monitoring:enabled,!dd_custom_cpu:enabled} by {host} + avg:system.cpu.user{dd_monitoring:enabled,!dd_custom_cpu:enabled} by {host} > 95" query = "min(last_5m):avg:system.cpu.system{dd_monitoring:enabled,!dd_custom_cpu:enabled} by {host} + avg:system.cpu.user{dd_monitoring:enabled,!dd_custom_cpu:enabled} by {host} > 95"
type = "query alert" type = "query alert"
count = "${var.linux-basics == "enabled" ? 1 : 0}" count = "${var.linux-basics == "enabled" ? 1 : 0}"
notify_no_data = false notify_no_data = false
renotify_interval = 60 renotify_interval = 60
@ -41,12 +41,12 @@ resource "datadog_monitor" "cpu_95_5min" {
} }
resource "datadog_monitor" "datadog_free_disk_space_5" { resource "datadog_monitor" "datadog_free_disk_space_5" {
name = "Free disk space < 5%" name = "Free disk space < 5%"
message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}" message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}"
query = "sum(last_5m):avg:system.disk.free{*} by {host,device} / avg:system.disk.total{*} by {host,device} * 100 < 5" query = "sum(last_5m):avg:system.disk.free{*} by {host,device} / avg:system.disk.total{*} by {host,device} * 100 < 5"
type = "query alert" type = "query alert"
count = "${var.linux-basics == "enabled" ? 1 : 0}" count = "${var.linux-basics == "enabled" ? 1 : 0}"
notify_no_data = false notify_no_data = false
renotify_interval = 60 renotify_interval = 60
@ -146,10 +146,10 @@ resource "datadog_monitor" "datadog_cpu_load" {
} }
resource "datadog_monitor" "datadog_free_memory" { resource "datadog_monitor" "datadog_free_memory" {
name = "Free memory < 5%" name = "Free memory < 5%"
message = "Debugging alert - no escalation" message = "Debugging alert - no escalation"
query = "sum(last_1m):avg:system.mem.free{*} by {host} / avg:system.mem.total{*} by {host} * 100 < 5" query = "sum(last_1m):avg:system.mem.free{*} by {host} / avg:system.mem.total{*} by {host} * 100 < 5"
type = "query alert" type = "query alert"
notify_no_data = false notify_no_data = false
renotify_interval = 60 renotify_interval = 60

View File

@ -1,11 +1,12 @@
resource "datadog_monitor" "kubernetes_redis_cpu_95_5min" { resource "datadog_monitor" "kubernetes_redis_cpu_95_5min" {
name = "Kubernetes Redis container CPU High > 95% for 5 min" name = "Kubernetes Redis container CPU High > 95% for 5 min"
#message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_HO} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.warning_HO}\n{{/is_warning_recovery}}" #message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_HO} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.warning_HO}\n{{/is_warning_recovery}}"
message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}" message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}"
query = "avg(last_5m):avg:gcp.container.cpu.utilization{container_name:redis} by {cluster-name} * 100 > 95" query = "avg(last_5m):avg:gcp.container.cpu.utilization{container_name:redis} by {cluster-name} * 100 > 95"
thresholds { thresholds {
# warning = 80 # warning = 80
critical = 95 critical = 95
} }
@ -24,7 +25,8 @@ resource "datadog_monitor" "kubernetes_redis_cpu_95_5min" {
} }
resource "datadog_monitor" "kubernetes_redis_cpu_80_15min" { resource "datadog_monitor" "kubernetes_redis_cpu_80_15min" {
name = "Kubernetes Redis container CPU High > 80% for 15 min" name = "Kubernetes Redis container CPU High > 80% for 15 min"
#message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_HO} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.warning_HO}\n{{/is_warning_recovery}}" #message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_HO} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.warning_HO}\n{{/is_warning_recovery}}"
message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}" message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}"
@ -32,7 +34,7 @@ resource "datadog_monitor" "kubernetes_redis_cpu_80_15min" {
type = "query alert" type = "query alert"
thresholds { thresholds {
# warning = 75 # warning = 75
critical = 80 critical = 80
} }
@ -72,3 +74,4 @@ resource "datadog_monitor" "kubernetes_redis_cpu_80_15min" {
# renotify_interval = 0 # renotify_interval = 0
# no_data_timeframe = 20 # no_data_timeframe = 20
# } # }

View File

@ -1,7 +1,7 @@
resource "datadog_monitor" "redis_connection" { resource "datadog_monitor" "redis_connection" {
name = "Redis connection is down (Datadog check)" name = "Redis connection is down (Datadog check)"
message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}" message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}"
query = "\"redis.can_connect\".over(\"app:redis\").by(\"*\").last(1).pct_by_status()" query = "\"redis.can_connect\".over(\"app:redis\").by(\"*\").last(1).pct_by_status()"
thresholds { thresholds {
critical = 50 critical = 50
@ -22,66 +22,59 @@ resource "datadog_monitor" "redis_connection" {
no_data_timeframe = 20 no_data_timeframe = 20
} }
# Datadog monitor "Redis eviction > 0": a metric alert on redis.keys.evicted,
# evaluated as min over the last 5 minutes and triggering when the value is > 0.
# The alert and recovery notifications are both addressed to var.alert_HNO.
# NOTE(review): notify_no_data and renotify_interval are assigned more than once
# in the attribute list below (renotify_interval as 60 and later as 0) — confirm
# which value is intended and remove the other assignment.
resource "datadog_monitor" "redis_eviction" { resource "datadog_monitor" "redis_eviction" {
name = "Redis eviction > 0" name = "Redis eviction > 0"
message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}" message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}"
query = "min(last_5m):avg:redis.keys.evicted{*} > 0" query = "min(last_5m):avg:redis.keys.evicted{*} > 0"
type = "metric alert" type = "metric alert"
notify_no_data = false
renotify_interval = 60
notify_audit = false
timeout_h = 0
include_tags = true
locked = false
require_full_window = true
new_host_delay = 300
notify_no_data = false
renotify_interval = 0
no_data_timeframe = 20
notify_no_data = false
renotify_interval = 60
notify_audit = false
timeout_h = 0
include_tags = true
locked = false
require_full_window = true
new_host_delay = 300
notify_no_data = false
renotify_interval = 0
no_data_timeframe = 20
} }
# Datadog monitor "Redis blocked clients > 0": a metric alert on
# redis.clients.blocked, evaluated as min over the last 5 minutes and
# triggering when the value is > 0.
# The alert and recovery notifications are both addressed to var.alert_HNO.
# NOTE(review): notify_no_data and renotify_interval are assigned more than once
# in the attribute list below (renotify_interval as 60 and later as 0) — confirm
# which value is intended and remove the other assignment.
resource "datadog_monitor" "datadog_blocked_client" { resource "datadog_monitor" "datadog_blocked_client" {
name = "Redis blocked clients > 0" name = "Redis blocked clients > 0"
message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}" message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}"
query = "min(last_5m):avg:redis.clients.blocked{*} > 0" query = "min(last_5m):avg:redis.clients.blocked{*} > 0"
type = "metric alert" type = "metric alert"
notify_no_data = false
renotify_interval = 60
notify_audit = false
timeout_h = 0
include_tags = true
locked = false
require_full_window = true
new_host_delay = 300
notify_no_data = false
renotify_interval = 0
no_data_timeframe = 20
notify_no_data = false
renotify_interval = 60
notify_audit = false
timeout_h = 0
include_tags = true
locked = false
require_full_window = true
new_host_delay = 300
notify_no_data = false
renotify_interval = 0
no_data_timeframe = 20
} }
# Datadog monitor "Redis begin to swap": a metric alert on
# redis.mem.fragmentation_ratio, evaluated as avg over the last 5 minutes and
# triggering when the value is <= 0.8.
# The alert and recovery notifications are both addressed to var.alert_HNO.
# NOTE(review): notify_no_data and renotify_interval are assigned more than once
# in the attribute list below (renotify_interval as 60 and later as 0) — confirm
# which value is intended and remove the other assignment.
resource "datadog_monitor" "redis_swap" { resource "datadog_monitor" "redis_swap" {
name = "Redis begin to swap" name = "Redis begin to swap"
message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}" message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}"
query = "avg(last_5m):avg:redis.mem.fragmentation_ratio{*} <= 0.8" query = "avg(last_5m):avg:redis.mem.fragmentation_ratio{*} <= 0.8"
type = "metric alert" type = "metric alert"
notify_no_data = false
renotify_interval = 60
notify_audit = false
timeout_h = 0
include_tags = true
locked = false
require_full_window = true
new_host_delay = 300
notify_no_data = false
renotify_interval = 0
no_data_timeframe = 20
notify_no_data = false
renotify_interval = 60
notify_audit = false
timeout_h = 0
include_tags = true
locked = false
require_full_window = true
new_host_delay = 300
notify_no_data = false
renotify_interval = 0
no_data_timeframe = 20
} }

View File

@ -96,6 +96,7 @@ variable "elb_4xx_threshold" {
variable "elb_backend_latency" { variable "elb_backend_latency" {
description = "Average time elapsed after the request leaves the load balancer until a response is received. In seconds" description = "Average time elapsed after the request leaves the load balancer until a response is received. In seconds"
default = { default = {
warning = 1 warning = 1
critical = 5 critical = 5

7
main.tf.ci Normal file
View File

@ -0,0 +1,7 @@
# Region passed to the AWS provider below. Declared without a default, so a
# value has to be supplied from outside (for example through a tfvars file).
variable "aws_region" {}

# AWS provider, pinned to one exact plugin release and configured with the
# region taken from var.aws_region.
provider "aws" {
  region  = "${var.aws_region}"
  version = "1.2.0"
}

View File

@ -54,7 +54,7 @@ resource "datadog_monitor" "datadog_free_disk_space_5" {
name = "[${var.env}] Free disk space < 5% on {{host.name}}" name = "[${var.env}] Free disk space < 5% on {{host.name}}"
message = "{{#is_alert}}\n${var.hno_escalation_group}\n{{/is_alert}}\n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}" message = "{{#is_alert}}\n${var.hno_escalation_group}\n{{/is_alert}}\n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}"
query = "sum(last_5m):avg:system.disk.free{dd_monitoring:enabled,dd_linux_basics:enabled,env:${var.env},!dd_custom_cpu:enabled} by {host,region,device} / avg:system.disk.total{dd_monitoring:enabled,dd_linux_basics:enabled,env:${var.env},!dd_custom_cpu:enabled} by {host,region,device} * 100 < 5" query = "sum(last_5m):avg:system.disk.free{dd_monitoring:enabled,dd_linux_basics:enabled,env:${var.env},!dd_custom_cpu:enabled} by {host,region,device} / avg:system.disk.total{dd_monitoring:enabled,dd_linux_basics:enabled,env:${var.env},!dd_custom_cpu:enabled} by {host,region,device} * 100 < 5"
type = "query alert" type = "query alert"
count = "${var.dd_linux_basics == "enabled" ? 1 : 0}" count = "${var.dd_linux_basics == "enabled" ? 1 : 0}"

6
terraform.tfvars.ci Normal file
View File

@ -0,0 +1,6 @@
# Variable values for the CI copy of terraform.tfvars.
# NOTE(review): the two escalation-group values look like placeholders — confirm
# they are never used to route real notifications.
aws_region           = "eu-west-1"
region               = "eu-west-1"
env                  = "test"
hno_escalation_group = "abc"
ho_escalation_group  = "abc"