diff --git a/.gitignore b/.gitignore new file mode 100755 index 0000000..1565be3 --- /dev/null +++ b/.gitignore @@ -0,0 +1,9 @@ +*.tfstate +*.tfstate.backup +.terraform +provider.tf +*.tfvars +**/*.tfvars +provider.tf +.github +.circleci diff --git a/README.md b/README.md new file mode 100644 index 0000000..4c1f65c --- /dev/null +++ b/README.md @@ -0,0 +1,166 @@ +# Terraform Datadog Synthetics Module + +## Overview + +This Terraform module creates synthetic API tests and associated monitors for endpoint health checking in Datadog. + +## Features + +- **HTTP API Health Checks**: Automated endpoint monitoring +- **Configurable Test Region**: Tests run from the AWS location selected by `region` +- **Pod Health Monitoring**: Kubernetes container monitoring +- **15-Minute Intervals**: Regular health check execution +- **Live Status**: Tests are active immediately upon creation + +## Resources Created + +- `datadog_synthetics_test` (beacon): HTTP API health check +- `datadog_monitor` (beacon): Kubernetes Pod Health metric alert + +## Requirements + +| Name | Version | +|------|---------| +| terraform | >= 0.12 | +| datadog | >= 2.0 | + +## Usage + +```hcl +module "synthetics" { + source = "./terraform-datadog-synthetics" + + datadog_api_key = var.datadog_api_key + datadog_app_key = var.datadog_app_key + region = "eu-west-1" + url = "api.example.com/health" + + dd_synthetics = { + name = "API Health Check" + tags = ["env:production", "service:api"] + } + + notify = { + alert_recipients = ["team@example.com"] + } +} +``` + +## Inputs + +| Name | Description | Type | Required | Default | +|------|-------------|------|----------|---------| +| `datadog_api_key` | Datadog API key | `string` | yes | - | +| `datadog_app_key` | Datadog APP key | `string` | yes | - | +| `dd_synthetics` | Synthetics configuration | `any` | yes | - | +| `notify` | Notification configuration | `any` | yes | - | +| `region` | AWS region | `string` | no | `"eu-west-1"` | +| `url` | Host and path to monitor, without a scheme (`http://` is prepended) | `string` | yes | - | 
+ +## Synthetic Test Configuration + +### HTTP API Test + +- **Type**: API test (HTTP subtype) +- **Method**: GET +- **Interval**: Every 15 minutes (900 seconds) +- **Assertion**: HTTP status code equals 200 +- **Locations**: A single AWS location, `aws:<region>`, taken from the `region` input +- **Failure Threshold**: 1 location +- **Status**: Live (active immediately) + +### Test Locations + +The synthetic test runs from one AWS location per module instance, selected by `region`: +- Set an EU region (for example the default `eu-west-1`) for European coverage +- Set a US region (for example `us-west-2`) for American coverage + +## Pod Health Monitor + +### Configuration + +- **Query**: `max(last_1m):sum:docker.containers.running{short_image:<app_name>} <= 1` +- **Type**: Metric alert +- **Thresholds**: + - OK: 3 containers running + - Warning: 2 or fewer containers running + - Critical: 1 or fewer containers running +- **Evaluation**: Maximum over the last 1 minute + +## Outputs + +Currently, this module does not export any outputs. + +## Synthetic Test Features + +### Assertions + +- Validates HTTP response status code is 200 +- Can be extended with additional assertions: + - Response time checks + - Response body validation + - Header validation + +### Locations + +Tests execute from AWS-based synthetic locations for: +- Low latency testing +- Geographic distribution +- Realistic user perspective + +### Timing + +- **Tick Interval**: 900 seconds (15 minutes) +- **Min Failure**: 1 location must fail to trigger alert +- **Min Location Success**: Not specified (uses default) + +## Notifications + +Configure notifications through the `notify` variable: +```hcl +notify = { + alert_recipients = ["critical@example.com"] + warning_recipients = ["warnings@example.com"] + message = "Custom alert message" +} +``` + +## Use Cases + +- **API Health Monitoring**: Ensure endpoints are responding +- **SLA Compliance**: Track uptime and availability +- **Geographic Testing**: Validate performance from multiple regions +- **Container Health**: Monitor Kubernetes pod availability + +## Notes + +- Tests start in "live" status immediately +- Minimum 1 location failure required to trigger alert 
+- Pod health monitor evaluates the maximum over the last 1 minute (`max(last_1m)`) +- Synthetic tests count towards Datadog billing +- Configure appropriate tick intervals to balance cost and monitoring frequency + +## Extension Ideas + +Add additional assertions: +```hcl +assertion { + type = "responseTime" + operator = "lessThan" + target = 2000 +} + +assertion { + type = "body" + operator = "contains" + target = "healthy" +} +``` + +## License + +Internal use only - Sanoma/WeBuildYourCloud + +## Authors + +Created and maintained by the Platform Engineering team. diff --git a/data.tf b/data.tf old mode 100644 new mode 100755 diff --git a/main.tf b/main.tf old mode 100644 new mode 100755 diff --git a/monitor.tf b/monitor.tf new file mode 100755 index 0000000..05b510c --- /dev/null +++ b/monitor.tf @@ -0,0 +1,21 @@ +# Metric alert on the count of running containers whose short_image tag equals var.app_name. +resource "datadog_monitor" "beacon" { + name = "Kubernetes Pod Health" + type = "metric alert" + message = "Kubernetes Pods are not in an optimal health state. Notify: @operator" + escalation_message = "Please investigate the Kubernetes Pods, @operator" + + query = "max(last_1m):sum:docker.containers.running{short_image:${var.app_name}} <= 1" + + monitor_thresholds { + # NOTE(review): the provider documents "ok" for service check monitors only; confirm it is honoured for a metric alert. + ok = 3 + warning = 2 + critical = 1 + } + + notify_no_data = true + + tags = ["app:${var.app_name}", "env:${var.environment}"] +} + diff --git a/outputs.tf b/outputs.tf old mode 100644 new mode 100755 diff --git a/provider.tf b/provider.tf old mode 100644 new mode 100755 diff --git a/synthetics.tf b/synthetics.tf new file mode 100755 index 0000000..da13592 --- /dev/null +++ b/synthetics.tf @@ -0,0 +1,30 @@ +# HTTP GET check of var.url from the aws:<var.region> location every 900 seconds; passes on status 200. +resource "datadog_synthetics_test" "beacon" { + type = "api" + subtype = "http" + + request_definition { + method = "GET" + # Prefix http:// only when var.url carries no http(s) scheme of its own; an unconditional + # prefix turned "https://host" into "http://https://host". Bare hosts still default to http. + url = replace(var.url, "/^https?:\\/\\//", "") == var.url ? "http://${var.url}" : var.url + } + + assertion { + type = "statusCode" + operator = "is" + target = "200" + } + + locations = ["aws:${var.region}"] + options_list { + tick_every = 900 + min_location_failed = 1 + } +
+ name = "${var.app_name} API Check" + message = "Oh no! Something is going wrong, please investigate!" + tags = ["app:${var.app_name}", "env:${var.environment}"] + + status = "live" +} diff --git a/terraform.tfvars b/terraform.tfvars old mode 100644 new mode 100755 diff --git a/variables.tf b/variables.tf old mode 100644 new mode 100755 index 7f948c7..1d603eb --- a/variables.tf +++ b/variables.tf @@ -21,3 +21,7 @@ variable "notify" {} variable "region" { default = "eu-west-1" } + +variable "url" { + description = "URL for Datadog to monitor; http:// is assumed when no scheme is given" +}