Merged in TER-209-add-bitbucket-pipelines-to-suppo (pull request #40)

TER-209 add bitbucket pipelines to suppo

Approved-by: Adrien Bréfort <adrien.brefort@fr.clara.net>
This commit is contained in:
Adrien Bréfort 2018-01-23 14:10:03 +00:00
commit 37d83aa0f2
23 changed files with 231 additions and 202 deletions

13
.gitignore vendored
View File

@ -1,10 +1,3 @@
# Ignore all volatile files
**/.terraform/modules
**/.terraform/plugins
**/terraform.tfstate*.backup
# Ignore all credentials files
**/terraform.tfvars
# Ignore all but root state files
**/terraform.tfstate
.terraform
main.tf
terraform.tfvars

15
bitbucket-pipelines.yml Normal file
View File

@ -0,0 +1,15 @@
# Bitbucket Pipelines CI for this Terraform module collection.
# Pin the Terraform image version so fmt/validate behavior is reproducible.
image: hashicorp/terraform:0.10.8
pipelines:
  default:
    # Step 1: formatting gate — fails the build if any .tf file is not
    # in canonical `terraform fmt` style (-check sets a non-zero exit code,
    # -diff prints what would change, -write=false leaves files untouched).
    - step:
        name: Format
        script:
          - terraform fmt -write=false -diff -check
    # Step 2: syntax/config validation — the *.ci files are renamed into
    # place first so `terraform init` + `terraform validate` have a root
    # module and auto-loaded variable values to work with.
    # NOTE(review): the .ci suffix keeps these files inert outside CI — confirm.
    - step:
        name: Validate
        script:
          - mv main.tf.ci main.tf
          - mv terraform.tfvars.ci terraform.tfvars
          - terraform init
          - terraform validate

View File

@ -12,6 +12,7 @@ resource "datadog_monitor" "es_cluster_status" {
message = "${var.message}"
type = "query alert"
query = <<EOF
max(last_30m): (
avg:aws.es.cluster_statusred{${data.template_file.filter.rendered}} by {region,name} * 2 +
@ -45,6 +46,7 @@ resource "datadog_monitor" "es_free_space_low" {
message = "${var.message}"
type = "query alert"
query = <<EOF
avg(last_15m): (
avg:aws.es.free_storage_space{${data.template_file.filter.rendered}} by {region,name} / (${var.es_cluster_volume_size}*1000) * 100
@ -76,6 +78,7 @@ resource "datadog_monitor" "es_cpu_90_15min" {
message = "${var.message}"
type = "query alert"
query = <<EOF
avg(last_15m): (
avg:aws.es.cpuutilization{${data.template_file.filter.rendered}} by {region,name}

View File

@ -15,6 +15,7 @@ resource "datadog_monitor" "apimgt_status" {
query = <<EOF
avg(last_5m):avg:azure.apimanagement_service.status{${data.template_file.filter.rendered}} by {resource_group,region,name} < 1
EOF
type = "metric alert"
thresholds {

View File

@ -13,6 +13,7 @@ resource "datadog_monitor" "eventhub_status" {
query = <<EOF
avg(last_5m): avg:azure.eventhub_namespaces.status{${data.template_file.filter.rendered}} by {resource_group,region,name} != 1
EOF
type = "metric alert"
notify_no_data = true
@ -41,6 +42,7 @@ resource "datadog_monitor" "eventhub_failed_requests" {
avg:azure.eventhub_namespaces.failed_requests{${data.template_file.filter.rendered}} by {resource_group,region,name}.as_count()
) * 100 > ${var.failed_requests_rate_thresold_critical}
EOF
type = "metric alert"
thresholds {
@ -78,6 +80,7 @@ resource "datadog_monitor" "eventhub_errors" {
avg:azure.eventhub_namespaces.other_errors{${data.template_file.filter.rendered}} by {resource_group,region,name}.as_count()
) * 100 > ${var.errors_rate_thresold_critical}
EOF
type = "metric alert"
thresholds {

View File

@ -69,4 +69,3 @@ variable "authorization_error_requests_threshold_critical" {
description = "Maximum acceptable percent of authorization error requests for a storage"
default = 15
}

View File

@ -122,7 +122,6 @@ EOF
tags = ["env:${var.environment}", "resource:storage", "team:azure", "provider:azure"]
}
resource "datadog_monitor" "network_error_requests" {
name = "[${var.environment}] Azure Storage {{value}}% of network error requests on {{name}}"
message = "${var.message}"
@ -152,7 +151,6 @@ EOF
tags = ["env:${var.environment}", "resource:storage", "team:azure", "provider:azure"]
}
resource "datadog_monitor" "throttling_error_requests" {
name = "[${var.environment}] Azure Storage {{value}}% of throttling error requests on {{name}}"
message = "${var.message}"
@ -182,7 +180,6 @@ EOF
tags = ["env:${var.environment}", "resource:storage", "team:azure", "provider:azure"]
}
resource "datadog_monitor" "server_other_error_requests" {
name = "[${var.environment}] Azure Storage {{value}}% of server_other error requests on {{name}}"
message = "${var.message}"
@ -212,7 +209,6 @@ EOF
tags = ["env:${var.environment}", "resource:storage", "team:azure", "provider:azure"]
}
resource "datadog_monitor" "client_other_error_requests" {
name = "[${var.environment}] Azure Storage {{value}}% of client_other error requests on {{name}}"
message = "${var.message}"
@ -242,7 +238,6 @@ EOF
tags = ["env:${var.environment}", "resource:storage", "team:azure", "provider:azure"]
}
resource "datadog_monitor" "authorization_error_requests" {
name = "[${var.environment}] Azure Storage {{value}}% of authorization error requests on {{name}}"
message = "${var.message}"

View File

@ -13,6 +13,7 @@ resource "datadog_monitor" "status" {
query = <<EOF
avg(last_5m):avg:azure.streamanalytics_streamingjobs.status{${data.template_file.filter.rendered}} by {resource_group,region,name} < 1
EOF
type = "metric alert"
notify_no_data = true
@ -38,6 +39,7 @@ resource "datadog_monitor" "su_utilization" {
avg:azure.streamanalytics_streamingjobs.resource_utilization{${data.template_file.filter.rendered}} by {resource_group,region,name}
) > ${var.su_utilization_threshold_critical}
EOF
type = "metric alert"
notify_no_data = false
@ -50,6 +52,7 @@ resource "datadog_monitor" "su_utilization" {
require_full_window = true
new_host_delay = "${var.delay}"
no_data_timeframe = 20
thresholds {
warning = "${var.su_utilization_threshold_warning}"
critical = "${var.su_utilization_threshold_critical}"
@ -68,6 +71,7 @@ resource "datadog_monitor" "failed_function_requests" {
avg:azure.streamanalytics_streamingjobs.aml_callout_requests{${data.template_file.filter.rendered}} by {resource_group,region,name}.as_count()
) * 100 > ${var.failed_function_requests_threshold_critical}
EOF
type = "metric alert"
notify_no_data = false
@ -80,6 +84,7 @@ resource "datadog_monitor" "failed_function_requests" {
require_full_window = true
new_host_delay = "${var.delay}"
no_data_timeframe = 20
thresholds {
warning = "${var.function_requests_threshold_warning}"
critical = "${var.failed_function_requests_threshold_critical}"
@ -97,6 +102,7 @@ resource "datadog_monitor" "conversion_errors" {
avg:azure.streamanalytics_streamingjobs.conversion_errors{${data.template_file.filter.rendered}} by {resource_group,region,name}
) > ${var.conversion_errors_threshold_critical}
EOF
type = "metric alert"
notify_no_data = false
@ -109,6 +115,7 @@ resource "datadog_monitor" "conversion_errors" {
require_full_window = true
new_host_delay = "${var.delay}"
no_data_timeframe = 20
thresholds {
warning = "${var.conversion_errors_threshold_warning}"
critical = "${var.conversion_errors_threshold_critical}"
@ -126,6 +133,7 @@ resource "datadog_monitor" "runtime_errors" {
avg:azure.streamanalytics_streamingjobs.errors{${data.template_file.filter.rendered}} by {resource_group,region,name}
) > ${var.runtime_errors_threshold_critical}
EOF
type = "metric alert"
notify_no_data = false
@ -138,6 +146,7 @@ resource "datadog_monitor" "runtime_errors" {
require_full_window = true
new_host_delay = "${var.delay}"
no_data_timeframe = 20
thresholds {
warning = "${var.runtime_errors_threshold_warning}"
critical = "${var.runtime_errors_threshold_critical}"

View File

@ -42,10 +42,10 @@ resource "datadog_monitor" "cloud_sql_disk_space" {
}
resource "datadog_monitor" "cloud_sql_connection_80" {
name = "Cloud SQL MySQL connection > 80% of max connections"
message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}"
query = "avg(last_5m):avg:gcp.cloudsql.database.network.connections{*} > 3500"
type = "metric alert"
name = "Cloud SQL MySQL connection > 80% of max connections"
message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}"
query = "avg(last_5m):avg:gcp.cloudsql.database.network.connections{*} > 3500"
type = "metric alert"
notify_no_data = false
renotify_interval = 60
@ -61,10 +61,10 @@ type = "metric alert"
}
resource "datadog_monitor" "cloud_sql_lag" {
name = "Cloud SQL MySQL lag > 45min"
message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}"
query = "min(last_10m):avg:gcp.cloudsql.database.mysql.replication.seconds_behind_master{*} by {database_id} > 2700"
type = "metric alert"
name = "Cloud SQL MySQL lag > 45min"
message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}"
query = "min(last_10m):avg:gcp.cloudsql.database.mysql.replication.seconds_behind_master{*} by {database_id} > 2700"
type = "metric alert"
notify_no_data = false
renotify_interval = 60
@ -77,14 +77,13 @@ type = "metric alert"
notify_no_data = false
renotify_interval = 0
no_data_timeframe = 20
}
resource "datadog_monitor" "cloud_sql_replication" {
name = "Cloud SQL Failover not ready to replication"
message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}"
query = "max(last_5m):avg:gcp.cloudsql.database.mysql.replication.available_for_failover{*} <= 0"
type = "metric alert"
name = "Cloud SQL Failover not ready to replication"
message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}"
query = "max(last_5m):avg:gcp.cloudsql.database.mysql.replication.available_for_failover{*} <= 0"
type = "metric alert"
notify_no_data = false
renotify_interval = 60
@ -97,5 +96,4 @@ type = "metric alert"
notify_no_data = false
renotify_interval = 0
no_data_timeframe = 20
}

View File

@ -48,7 +48,9 @@ resource "datadog_monitor" "kubernetes_kubelet_check" {
resource "datadog_monitor" "kubernetes_kubelet_ping" {
name = "Kubernetes kubelet ping not ok"
message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_HO} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.warning_HO}\n{{/is_warning_recovery}}"message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_HO} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.warning_HO}\n{{/is_warning_recovery}}"
message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_HO} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.warning_HO}\n{{/is_warning_recovery}}"
message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_HO} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.warning_HO}\n{{/is_warning_recovery}}"
query = "\"kubernetes.kubelet.check.ping\".over(\"goog-gke-node\").by(\"*\").last(1).pct_by_status()"
thresholds {
@ -108,7 +110,7 @@ resource "datadog_monitor" "kubernetes_node_status" {
no_data_timeframe = 20
}
type = "query alert"
/* type = "query alert"
thresholds {
# warning = 75
@ -126,4 +128,5 @@ resource "datadog_monitor" "kubernetes_node_status" {
notify_no_data = false
renotify_interval = 0
no_data_timeframe = 20
}
}*/

View File

@ -1,11 +1,12 @@
resource "datadog_monitor" "kubernetes_redis_cpu_95_5min" {
name = "Kubernetes Redis container CPU High > 95% for 5 min"
#message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_HO} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.warning_HO}\n{{/is_warning_recovery}}"
message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}"
query = "avg(last_5m):avg:gcp.container.cpu.utilization{container_name:redis} by {cluster-name} * 100 > 95"
thresholds {
# warning = 80
# warning = 80
critical = 95
}
@ -25,6 +26,7 @@ resource "datadog_monitor" "kubernetes_redis_cpu_95_5min" {
resource "datadog_monitor" "kubernetes_redis_cpu_80_15min" {
name = "Kubernetes Redis container CPU High > 80% for 15 min"
#message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}\n{{#is_warning}}\n${var.warning_HO} \n{{/is_warning}} \n{{#is_warning_recovery}}\n${var.warning_HO}\n{{/is_warning_recovery}}"
message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}"
@ -32,7 +34,7 @@ resource "datadog_monitor" "kubernetes_redis_cpu_80_15min" {
type = "query alert"
thresholds {
# warning = 75
# warning = 75
critical = 80
}
@ -72,3 +74,4 @@ resource "datadog_monitor" "kubernetes_redis_cpu_80_15min" {
# renotify_interval = 0
# no_data_timeframe = 20
# }

View File

@ -22,13 +22,11 @@ resource "datadog_monitor" "redis_connection" {
no_data_timeframe = 20
}
resource "datadog_monitor" "redis_eviction" {
name = "Redis eviction > 0"
message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}"
query = "min(last_5m):avg:redis.keys.evicted{*} > 0"
type = "metric alert"
name = "Redis eviction > 0"
message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}"
query = "min(last_5m):avg:redis.keys.evicted{*} > 0"
type = "metric alert"
notify_no_data = false
renotify_interval = 60
@ -41,15 +39,13 @@ type = "metric alert"
notify_no_data = false
renotify_interval = 0
no_data_timeframe = 20
}
resource "datadog_monitor" "datadog_blocked_client" {
name = "Redis blocked clients > 0"
message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}"
query = "min(last_5m):avg:redis.clients.blocked{*} > 0"
type = "metric alert"
name = "Redis blocked clients > 0"
message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}"
query = "min(last_5m):avg:redis.clients.blocked{*} > 0"
type = "metric alert"
notify_no_data = false
renotify_interval = 60
@ -62,15 +58,13 @@ type = "metric alert"
notify_no_data = false
renotify_interval = 0
no_data_timeframe = 20
}
resource "datadog_monitor" "redis_swap" {
name = "Redis begin to swap"
message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}"
query = "avg(last_5m):avg:redis.mem.fragmentation_ratio{*} <= 0.8"
type = "metric alert"
name = "Redis begin to swap"
message = "{{#is_alert}}\n${var.alert_HNO} \n{{/is_alert}} \n{{#is_recovery}}\n${var.alert_HNO}\n{{/is_recovery}}"
query = "avg(last_5m):avg:redis.mem.fragmentation_ratio{*} <= 0.8"
type = "metric alert"
notify_no_data = false
renotify_interval = 60
@ -83,5 +77,4 @@ type = "metric alert"
notify_no_data = false
renotify_interval = 0
no_data_timeframe = 20
}

View File

@ -96,6 +96,7 @@ variable "elb_4xx_threshold" {
variable "elb_backend_latency" {
description = "Average time elapsed after the request leaves the load balancer until a response is received. In seconds"
default = {
warning = 1
critical = 5

7
main.tf.ci Normal file
View File

@ -0,0 +1,7 @@
# CI-only Terraform root module: the pipeline's "Validate" step renames this
# file to main.tf before running `terraform init` / `terraform validate`.

# Supplied by terraform.tfvars.ci (renamed to terraform.tfvars in CI).
variable "aws_region" {}

provider "aws" {
  region  = "${var.aws_region}"

  # Pin the provider so CI validation results are reproducible.
  version = "1.2.0"
}

6
terraform.tfvars.ci Normal file
View File

@ -0,0 +1,6 @@
# Dummy variable values for the CI "Validate" step only (renamed to
# terraform.tfvars by the pipeline so Terraform auto-loads them).
# Aligned per `terraform fmt` convention.
aws_region           = "eu-west-1"
region               = "eu-west-1"
env                  = "test"
hno_escalation_group = "abc"
ho_escalation_group  = "abc"