From 44e2f267a0dc955a03a7adeb83b14870b81aeac6 Mon Sep 17 00:00:00 2001 From: Patrick de Ruiter Date: Sat, 1 Nov 2025 10:19:32 +0100 Subject: [PATCH] Add comprehensive README and update module documentation --- .gitignore | 9 +++ README.md | 211 +++++++++++++++++++++++++++++++++++++++++++++++++++ context.tf | 184 ++++++++++++++++++++++++++++++++++++++++++++ main.tf | 9 +++ monitors.tf | 137 +++++++++++++++++++++++++++++++++ outputs.tf | 16 ++++ variables.tf | 33 ++++++++ versions.tf | 18 +++++ 8 files changed, 617 insertions(+) create mode 100755 .gitignore create mode 100644 README.md create mode 100755 context.tf create mode 100755 main.tf create mode 100755 monitors.tf create mode 100755 outputs.tf create mode 100755 variables.tf create mode 100755 versions.tf diff --git a/.gitignore b/.gitignore new file mode 100755 index 0000000..1565be3 --- /dev/null +++ b/.gitignore @@ -0,0 +1,9 @@ +*.tfstate +*.tfstate.backup +.terraform +provider.tf +*.tfvars +**/*.tfvars +provider.tf +.github +.circleci diff --git a/README.md b/README.md new file mode 100644 index 0000000..7f35bdc --- /dev/null +++ b/README.md @@ -0,0 +1,211 @@ +# Terraform AWS Datadog2 Integration & Monitoring Module + +## Overview + +The `terraform-aws-datadog2` module is a comprehensive Terraform configuration that integrates AWS with Datadog for monitoring and alerting. It sets up AWS-Datadog integration and creates pre-configured Datadog monitors to track critical infrastructure metrics. 
+ +## Features + +- Automated AWS-Datadog integration setup +- Pre-configured infrastructure monitors for: + - CPU utilization + - Memory utilization + - System load + - Disk space + - Disk inodes + - Disk usage forecasting (7-day prediction) +- CloudPosse label/tagging context for consistent naming +- Support for both EU and US Datadog endpoints + +## Resources Created + +### AWS Resources (via CloudPosse Module) +- **IAM Role** - Allows Datadog to assume this role for monitoring AWS resources +- **External ID** - Security mechanism for cross-account role assumption +- Associated IAM policies for AWS monitoring permissions + +### Datadog Monitors + +1. **CPU Utilization Monitor** + - Type: Metric alert + - Warning: 50% + - Critical: 60% + +2. **Memory Utilization Monitor** + - Type: Query alert + - Evaluation: 5 minutes + - Warning: 10% usable memory remaining + - Critical: 5% usable memory remaining + +3. **System Load Monitor** + - Type: Query alert + - Tracks: 5-minute normalized system load + - Evaluation: 30 minutes + - Warning: 2.0 + - Critical: 2.5 + +4. **Disk Space Monitor** + - Type: Query alert + - Evaluation: 5 minutes + - Warning: 80% used + - Critical: 90% used + +5. **Disk Inodes Monitor** + - Type: Query alert + - Evaluation: 5 minutes + - Warning: 90% used + - Critical: 95% used + +6. **Disk Usage Forecast Monitor** + - Type: Query alert with forecasting + - Prediction: Next 7 days + - Forecast model: Linear + - Warning: 72% predicted usage + - Critical: 80% predicted usage + +## Usage + +```hcl +module "datadog_monitoring" { + source = "path/to/terraform-aws-datadog2" + + # Required variables + region = "eu-west-1" + api_key = var.datadog_api_key # Store securely! + app_key = var.datadog_app_key # Store securely! 
+ aws_profile = "your-aws-profile" + prefix_slug = "mycompany" + team = "platform" + + # Optional variables + datadog_site = "https://api.datadoghq.eu/" # Default + + # CloudPosse label context (optional) + namespace = "myorg" + environment = "prod" + stage = "production" + name = "monitoring" + + tags = { + Project = "Infrastructure" + ManagedBy = "Terraform" + } +} +``` + +## Variables + +### Required Variables + +| Variable | Type | Description | +|----------|------|-------------| +| `region` | string | AWS region where monitored resources reside | +| `api_key` | string | Datadog API key for sending logs, metrics, and traces | +| `app_key` | string | Datadog application key for API manipulation | +| `aws_profile` | string | AWS profile name for authentication | +| `prefix_slug` | string | Prefix slug for naming | +| `team` | string | Team identifier | + +### Optional Variables + +| Variable | Type | Default | Description | +|----------|------|---------|-------------| +| `datadog_site` | string | `https://api.datadoghq.eu/` | Datadog site endpoint | + +### CloudPosse Label Context Variables + +| Variable | Type | Default | Description | +|----------|------|---------|-------------| +| `enabled` | bool | null | Enable/disable resource creation | +| `namespace` | string | null | Organization name or abbreviation | +| `environment` | string | null | Environment identifier | +| `stage` | string | null | Stage identifier | +| `name` | string | null | Solution name | +| `delimiter` | string | null | Delimiter between name components | +| `attributes` | list(string) | [] | Additional attributes for naming | +| `tags` | map(string) | {} | Additional tags | +| `label_order` | list(string) | null | Custom ordering of name components | + +## Outputs + +| Output | Description | +|--------|-------------| +| `aws_account_id` | AWS Account ID of the IAM Role for Datadog | +| `aws_role_name` | Name of the AWS IAM Role for Datadog | +| `datadog_external_id` | External ID for 
secure role assumption | + +**Note:** These outputs are essential for completing the Datadog integration by providing values to enter in Datadog's AWS integration settings. + +## Dependencies + +### Terraform Requirements +- Terraform >= 0.13.0 + +### Provider Requirements +- `hashicorp/aws` - AWS infrastructure management +- `datadog/datadog` - Datadog monitoring resources +- `hashicorp/local` >= 1.3 - Local file operations + +### External Modules +1. **cloudposse/datadog-integration/aws** (v0.11.0) + - Creates AWS IAM role and permissions for Datadog + - Handles cross-account role assumption + +2. **cloudposse/label/null** (v0.24.1) + - Provides consistent tagging and naming conventions + +### Prerequisites +- Valid AWS account with IAM role creation permissions +- Active Datadog account with monitor creation access +- Network connectivity to AWS and Datadog APIs +- Proper AWS profile configured + +## Post-Deployment Setup + +After applying this module, complete the integration in Datadog: + +1. Navigate to AWS integration settings in Datadog console +2. Add AWS account using the `aws_account_id` output +3. Add the `aws_role_name` as the IAM role name +4. Provide the `datadog_external_id` as the external ID +5. Complete the AWS integration in Datadog console + +## Monitor Alert Notifications + +To receive alerts, configure notification channels in Datadog and update the monitors to include your notification preferences. + +## Customization + +### Adjusting Monitor Thresholds + +To adjust alert thresholds, modify the monitor resources in `monitors.tf`: + +```hcl +# Example: Adjust CPU warning to 60% and critical to 80% +resource "datadog_monitor" "cpumonitor" { + # ... other settings; keep the threshold in `query` equal to `critical` ... + monitor_thresholds { + warning = 60 + critical = 80 + } +} +``` + +### Adding Additional Monitors + +Add new monitor resources to `monitors.tf` following the existing patterns. 
+ +## Security Considerations + +- Store API keys and app keys securely (use Terraform Cloud, AWS Secrets Manager, or HashiCorp Vault) +- Never commit sensitive credentials to version control +- Use IAM role-based access instead of IAM user credentials where possible +- Review and adjust monitor thresholds based on your workload requirements + +## License + +See project license file. + +## Authors + +Maintained by WebBuildYourCloud team. diff --git a/context.tf b/context.tf new file mode 100755 index 0000000..307f711 --- /dev/null +++ b/context.tf @@ -0,0 +1,184 @@ +module "this" { + source = "cloudposse/label/null" + version = "0.24.1" # requires Terraform >= 0.13.0 + + enabled = var.enabled + namespace = var.namespace + environment = var.environment + stage = var.stage + name = var.name + delimiter = var.delimiter + attributes = var.attributes + tags = var.tags + additional_tag_map = var.additional_tag_map + label_order = var.label_order + regex_replace_chars = var.regex_replace_chars + id_length_limit = var.id_length_limit + label_key_case = var.label_key_case + label_value_case = var.label_value_case + + context = var.context +} + +# Copy of cloudposse/terraform-null-label/variables.tf: these variables feed module "this" above (label module pinned to 0.24.1) + +variable "context" { + type = any + default = { + enabled = true + namespace = null + environment = null + stage = null + name = null + delimiter = null + attributes = [] + tags = {} + additional_tag_map = {} + regex_replace_chars = null + label_order = [] + id_length_limit = null + label_key_case = null + label_value_case = null + } + description = <<-EOT + Single object for setting entire context at once. + See description of individual variables for details. + Leave string and numeric variables as `null` to use default value. + Individual variable settings (non-null) override settings in context object, + except for attributes, tags, and additional_tag_map, which are merged. 
+ EOT + + validation { + condition = lookup(var.context, "label_key_case", null) == null ? true : contains(["lower", "title", "upper"], var.context["label_key_case"]) + error_message = "Allowed values: `lower`, `title`, `upper`." + } + + validation { + condition = lookup(var.context, "label_value_case", null) == null ? true : contains(["lower", "title", "upper", "none"], var.context["label_value_case"]) + error_message = "Allowed values: `lower`, `title`, `upper`, `none`." + } +} + +variable "enabled" { + type = bool + default = null + description = "Set to false to prevent the module from creating any resources" +} + +variable "namespace" { + type = string + default = null + description = "Namespace, which could be your organization name or abbreviation, e.g. 'eg' or 'cp'" +} + +variable "environment" { + type = string + default = null + description = "Environment, e.g. 'uw2', 'us-west-2', OR 'prod', 'staging', 'dev', 'UAT'" +} + +variable "stage" { + type = string + default = null + description = "Stage, e.g. 'prod', 'staging', 'dev', OR 'source', 'build', 'test', 'deploy', 'release'" +} + +variable "name" { + type = string + default = null + description = "Solution name, e.g. 'app' or 'jenkins'" +} + +variable "delimiter" { + type = string + default = null + description = <<-EOT + Delimiter to be used between `namespace`, `environment`, `stage`, `name` and `attributes`. + Defaults to `-` (hyphen). Set to `""` to use no delimiter at all. + EOT +} + +variable "attributes" { + type = list(string) + default = [] + description = "Additional attributes (e.g. `1`)" +} + +variable "tags" { + type = map(string) + default = {} + description = "Additional tags (e.g. `map('BusinessUnit','XYZ')`" +} + +variable "additional_tag_map" { + type = map(string) + default = {} + description = "Additional tags for appending to tags_as_list_of_maps. Not added to `tags`." 
+} + +variable "label_order" { + type = list(string) + default = null + description = <<-EOT + The naming order of the id output and Name tag. + Defaults to ["namespace", "environment", "stage", "name", "attributes"]. + You can omit any of the 5 elements, but at least one must be present. + EOT +} + +variable "regex_replace_chars" { + type = string + default = null + description = <<-EOT + Regex to replace chars with empty string in `namespace`, `environment`, `stage` and `name`. + If not set, `"/[^a-zA-Z0-9-]/"` is used to remove all characters other than hyphens, letters and digits. + EOT +} + +variable "id_length_limit" { + type = number + default = null + description = <<-EOT + Limit `id` to this many characters (minimum 6). + Set to `0` for unlimited length. + Set to `null` for default, which is `0`. + Does not affect `id_full`. + EOT + validation { + condition = var.id_length_limit == null ? true : var.id_length_limit >= 6 || var.id_length_limit == 0 + error_message = "The id_length_limit must be >= 6 if supplied (not null), or 0 for unlimited length." + } +} + +variable "label_key_case" { + type = string + default = null + description = <<-EOT + The letter case of label keys (`tag` names) (i.e. `name`, `namespace`, `environment`, `stage`, `attributes`) to use in `tags`. + Possible values: `lower`, `title`, `upper`. + Default value: `title`. + EOT + + validation { + condition = var.label_key_case == null ? true : contains(["lower", "title", "upper"], var.label_key_case) + error_message = "Allowed values: `lower`, `title`, `upper`." + } +} + +variable "label_value_case" { + type = string + default = null + description = <<-EOT + The letter case of output label values (also used in `tags` and `id`). + Possible values: `lower`, `title`, `upper` and `none` (no transformation). + Default value: `lower`. + EOT + + validation { + condition = var.label_value_case == null ? 
true : contains(["lower", "title", "upper", "none"], var.label_value_case)
+    error_message = "Allowed values: `lower`, `title`, `upper`, `none`."
+  }
+}
+#### End of copy of cloudposse/terraform-null-label/variables.tf
+
+
diff --git a/main.tf b/main.tf
new file mode 100755
index 0000000..51b5bf9
--- /dev/null
+++ b/main.tf
@@ -0,0 +1,9 @@
+module "datadog_integration" { # AWS side of the Datadog integration; its outputs are re-exported in outputs.tf
+  source       = "cloudposse/datadog-integration/aws"
+  version      = "0.11.0"
+  namespace    = coalesce(var.namespace, "plaap") # label inputs win; the literals are the previously hard-coded values
+  stage        = coalesce(var.stage, "test")
+  name         = coalesce(var.name, "datadog-integration")
+  integrations = ["all"]
+}
+
diff --git a/monitors.tf b/monitors.tf
new file mode 100755
index 0000000..7e82a36
--- /dev/null
+++ b/monitors.tf
@@ -0,0 +1,137 @@
+# CPU monitor: per-host 1-minute average of system.cpu.system; warning at 50, critical above 60
+resource "datadog_monitor" "cpumonitor" {
+  name    = "cpu monitor"
+  type    = "metric alert"
+  message = "CPU usage alert"
+  query   = "avg(last_1m):avg:system.cpu.system{*} by {host} > 60" # NOTE(review): system.cpu.system only, not total CPU utilisation - confirm intent
+  monitor_thresholds {
+    ok       = 20 # NOTE(review): provider docs describe `ok` for service-check monitors - confirm it is honoured for a metric alert
+    warning  = 50
+    critical = 60 # must match the threshold in `query`
+  }
+}
+
+# Usable-memory monitor: per-host usable/total memory as a percentage (max over last 5m); warning below 10, critical below 5
+resource "datadog_monitor" "memorymonitor" {
+  name                = "Usable Memory"
+  type                = "query alert"
+  evaluation_delay    = 15
+  include_tags        = true
+  locked              = false
+  message             = "Usable memory is below set threshold, please investigate"
+  new_host_delay      = 300
+  no_data_timeframe   = 0
+  #notify_audit = 0
+  #notify_no_data = 0
+  priority            = 0
+  renotify_interval   = 0
+  require_full_window = true
+  #restricted_roles = []
+  timeout_h           = 0
+  query               = "max(last_5m):avg:system.mem.usable{*} by {host} / avg:system.mem.total{*} by {host} * 100 < 5"
+
+  monitor_thresholds {
+    critical = 5
+    warning  = 10
+  }
+}
+
+# System-load monitor: per-host system.load.norm.5 (min over last 30m); warning above 2, critical above 2.5
+resource "datadog_monitor" "systemload" {
+  name                = "System Load"
+  type                = "query alert"
+  evaluation_delay    = 15
+  include_tags        = true
+  locked              = false
+  new_host_delay      = 300
+  no_data_timeframe   = 0
+  #notify_audit = false
+  priority            = 0
+  renotify_interval   = 0
+  require_full_window = true
+  #restricted_roles = []
+  tags                = []
+  timeout_h           = 0
+  message             = "System Load exceeding set threshold, please investigate"
+  query               = "min(last_30m):( avg:system.load.norm.5{*} by {host} ) > 2.5"
+
+  monitor_thresholds {
+    critical = "2.5"
+    warning  = "2"
+  }
+
+}
+# Disk-space monitor: system.disk.in_use * 100 per host and device (max over last 5m); warning above 80, critical above 90
+resource "datadog_monitor" "disk_usage" {
+  name                = "Disk Space"
+  type                = "query alert"
+  evaluation_delay    = 15
+  include_tags        = true
+  locked              = false
+  new_host_delay      = 300
+  no_data_timeframe   = 0
+  #notify_audit = 0
+  priority            = 0
+  renotify_interval   = 0
+  require_full_window = true
+  #restricted_roles = []
+  tags                = []
+  timeout_h           = 0
+  message             = "Disk Usage is exceeding set threshold, please investigate"
+  query               = "max(last_5m):avg:system.disk.in_use{*} by {host,device} * 100 > 90"
+
+  monitor_thresholds {
+    critical = "90"
+    warning  = "80"
+  }
+}
+
+# Disk-inode monitor: system.fs.inodes.in_use * 100 per host and device (min over last 5m); warning above 90, critical above 95
+resource "datadog_monitor" "disk_inodes" {
+  name                = "Disk Inodes Usage"
+  type                = "query alert"
+  evaluation_delay    = 15
+  include_tags        = true
+  locked              = false
+  new_host_delay      = 300
+  no_data_timeframe   = 0
+  #notify_audit = 0
+  priority            = 0
+  renotify_interval   = 0
+  require_full_window = true
+  #restricted_roles = []
+  tags                = []
+  timeout_h           = 0
+  message             = "Disk Inode Usage is exceeding set threshold, please investigate"
+  query               = "min(last_5m):avg:system.fs.inodes.in_use{*} by {host,device} * 100 > 95"
+
+  monitor_thresholds {
+    critical = "95"
+    warning  = "90"
+  }
+}
+
+# Disk-forecast monitor: linear forecast of system.disk.in_use * 100 over the next week (1w history); warning at 72, critical at 80
+resource "datadog_monitor" "disk_forecast" {
+  name                = "Disk Usage Forecast"
+  type                = "query alert"
+  evaluation_delay    = 15
+  include_tags        = true
+  locked              = false
+  new_host_delay      = 300
+  no_data_timeframe   = 0
+  #notify_audit = 0
+  priority            = 0
+  renotify_interval   = 0
+  require_full_window = true
+  #restricted_roles = []
+  tags                = []
+  timeout_h           = 0
+  message             = "Disk Usage is forecast to exceed set threshold within the next week, please investigate"
+  query               = "max(next_1w):forecast(avg:system.disk.in_use{*} by {host,device} * 100, 'linear', 1, interval='60m', history='1w', model='default') >= 80"
+
+  monitor_thresholds {
+    critical = "80"
+    warning  = "72"
+  }
+}
\ No newline at end of file
diff --git a/outputs.tf b/outputs.tf
new file mode 100755
index 0000000..091d06e
--- /dev/null
+++ b/outputs.tf
@@ -0,0 +1,16 @@
+output "aws_account_id" { # pass-through of the datadog_integration module output
+  value       = module.datadog_integration.aws_account_id
+  description = "AWS Account ID of the IAM Role for Datadog to use for this integration"
+}
+
+output "aws_role_name" { # pass-through of the datadog_integration module output
+  value       = module.datadog_integration.aws_role_name
+  description = "Name of the AWS IAM Role for Datadog to use for this integration"
+}
+
+output "datadog_external_id" { # pass-through of the datadog_integration module output
+  value       = module.datadog_integration.datadog_external_id
+  description = "Datadog integration external ID"
+}
+
+
diff --git a/variables.tf b/variables.tf
new file mode 100755
index 0000000..af04660
--- /dev/null
+++ b/variables.tf
@@ -0,0 +1,33 @@
+variable "region" {
+  type        = string
+  description = "The AWS region that the resources to be monitored reside in"
+}
+
+variable "api_key" {
+  type        = string # NOTE(review): add `sensitive = true` once required_version is raised to >= 0.14
+  description = "The api_key that is used to send logs, metrics and traces to the datadog account"
+}
+
+variable "app_key" {
+  type        = string # NOTE(review): add `sensitive = true` once required_version is raised to >= 0.14
+  description = "The app_key that is used to manipulate the datadog API"
+}
+
+variable "datadog_site" {
+  type        = string
+  description = "Datadog site to connect to (EU or US)"
+  default     = "https://api.datadoghq.eu/"
+}
+
+variable "aws_profile" {
+  type        = string
+  description = "Which AWS account is this for"
+}
+
+variable "prefix_slug" { # README: "Prefix slug for naming". NOTE(review): no description declared and not referenced by any file in this patch - confirm where it is consumed
+  type = string
+}
+
+variable "team" { # README: "Team identifier". NOTE(review): no description declared and not referenced by any file in this patch - confirm where it is consumed
+  type = string
+}
diff --git a/versions.tf b/versions.tf
new file mode 100755
index 0000000..6e26982
--- /dev/null
+++ b/versions.tf
@@ -0,0 +1,18 @@
+terraform {
+  required_version = ">= 0.13.0"
+
+  required_providers {
+    aws = {
+      source = "hashicorp/aws"
+      #version = ">= 2.0"
+    }
+    local = {
+      source  = "hashicorp/local"
+      version = ">= 1.3"
+    }
+    datadog = {
+      source = "datadog/datadog"
+      #version = ">= 2.12"
+    }
+  }
+}