commit 44e2f267a0dc955a03a7adeb83b14870b81aeac6 Author: Patrick de Ruiter Date: Sat Nov 1 10:19:32 2025 +0100 Add comprehensive README and update module documentation diff --git a/.gitignore b/.gitignore new file mode 100755 index 0000000..1565be3 --- /dev/null +++ b/.gitignore @@ -0,0 +1,9 @@ +*.tfstate +*.tfstate.backup +.terraform +provider.tf +*.tfvars +**/*.tfvars +provider.tf +.github +.circleci diff --git a/README.md b/README.md new file mode 100644 index 0000000..7f35bdc --- /dev/null +++ b/README.md @@ -0,0 +1,211 @@ +# Terraform AWS Datadog2 Integration & Monitoring Module + +## Overview + +The `terraform-aws-datadog2` module is a comprehensive Terraform configuration that integrates AWS with Datadog for monitoring and alerting. It sets up AWS-Datadog integration and creates pre-configured Datadog monitors to track critical infrastructure metrics. + +## Features + +- Automated AWS-Datadog integration setup +- Pre-configured infrastructure monitors for: + - CPU utilization + - Memory utilization + - System load + - Disk space + - Disk inodes + - Disk usage forecasting (7-day prediction) +- CloudPosse label/tagging context for consistent naming +- Support for both EU and US Datadog endpoints + +## Resources Created + +### AWS Resources (via CloudPosse Module) +- **IAM Role** - Allows Datadog to assume this role for monitoring AWS resources +- **External ID** - Security mechanism for cross-account role assumption +- Associated IAM policies for AWS monitoring permissions + +### Datadog Monitors + +1. **CPU Utilization Monitor** + - Type: Metric alert + - Warning: 50% + - Critical: 60% + +2. **Memory Utilization Monitor** + - Type: Query alert + - Evaluation: 5 minutes + - Warning: 10% usable memory remaining + - Critical: 5% usable memory remaining + +3. **System Load Monitor** + - Type: Query alert + - Tracks: 5-minute normalized system load + - Evaluation: 30 minutes + - Warning: 2.0 + - Critical: 2.5 + +4. 
**Disk Space Monitor** + - Type: Query alert + - Evaluation: 5 minutes + - Warning: 80% used + - Critical: 90% used + +5. **Disk Inodes Monitor** + - Type: Query alert + - Evaluation: 5 minutes + - Warning: 90% used + - Critical: 95% used + +6. **Disk Usage Forecast Monitor** + - Type: Query alert with forecasting + - Prediction: Next 7 days + - Forecast model: Linear + - Warning: 72% predicted usage + - Critical: 80% predicted usage + +## Usage + +```hcl +module "datadog_monitoring" { + source = "path/to/terraform-aws-datadog2" + + # Required variables + region = "eu-west-1" + api_key = var.datadog_api_key # Store securely! + app_key = var.datadog_app_key # Store securely! + aws_profile = "your-aws-profile" + prefix_slug = "mycompany" + team = "platform" + + # Optional variables + datadog_site = "https://api.datadoghq.eu/" # Default + + # CloudPosse label context (optional) + namespace = "myorg" + environment = "prod" + stage = "production" + name = "monitoring" + + tags = { + Project = "Infrastructure" + ManagedBy = "Terraform" + } +} +``` + +## Variables + +### Required Variables + +| Variable | Type | Description | +|----------|------|-------------| +| `region` | string | AWS region where monitored resources reside | +| `api_key` | string | Datadog API key for sending logs, metrics, and traces | +| `app_key` | string | Datadog application key for API manipulation | +| `aws_profile` | string | AWS profile name for authentication | +| `prefix_slug` | string | Prefix slug for naming | +| `team` | string | Team identifier | + +### Optional Variables + +| Variable | Type | Default | Description | +|----------|------|---------|-------------| +| `datadog_site` | string | `https://api.datadoghq.eu/` | Datadog site endpoint | + +### CloudPosse Label Context Variables + +| Variable | Type | Default | Description | +|----------|------|---------|-------------| +| `enabled` | bool | null | Enable/disable resource creation | +| `namespace` | string | null | Organization name 
or abbreviation | +| `environment` | string | null | Environment identifier | +| `stage` | string | null | Stage identifier | +| `name` | string | null | Solution name | +| `delimiter` | string | null | Delimiter between name components | +| `attributes` | list(string) | [] | Additional attributes for naming | +| `tags` | map(string) | {} | Additional tags | +| `label_order` | list(string) | null | Custom ordering of name components | + +## Outputs + +| Output | Description | +|--------|-------------| +| `aws_account_id` | AWS Account ID of the IAM Role for Datadog | +| `aws_role_name` | Name of the AWS IAM Role for Datadog | +| `datadog_external_id` | External ID for secure role assumption | + +**Note:** These outputs are essential for completing the Datadog integration by providing values to enter in Datadog's AWS integration settings. + +## Dependencies + +### Terraform Requirements +- Terraform >= 0.13.0 + +### Provider Requirements +- `hashicorp/aws` - AWS infrastructure management +- `datadog/datadog` - Datadog monitoring resources +- `hashicorp/local` >= 1.3 - Local file operations + +### External Modules +1. **cloudposse/datadog-integration/aws** (v0.11.0) + - Creates AWS IAM role and permissions for Datadog + - Handles cross-account role assumption + +2. **cloudposse/label/null** (v0.24.1) + - Provides consistent tagging and naming conventions + +### Prerequisites +- Valid AWS account with IAM role creation permissions +- Active Datadog account with monitor creation access +- Network connectivity to AWS and Datadog APIs +- Proper AWS profile configured + +## Post-Deployment Setup + +After applying this module, complete the integration in Datadog: + +1. Navigate to AWS integration settings in Datadog console +2. Add AWS account using the `aws_account_id` output +3. Add the `aws_role_name` as the IAM role name +4. Provide the `datadog_external_id` as the external ID +5. 
Complete the AWS integration in Datadog console + +## Monitor Alert Notifications + +To receive alerts, configure notification channels in Datadog and update the monitors to include your notification preferences. + +## Customization + +### Adjusting Monitor Thresholds + +To adjust alert thresholds, modify the monitor resources in `monitors.tf`: + +```hcl +# Example: Adjust CPU warning to 60% and critical to 80% +resource "datadog_monitor" "cpumonitor" { + # ... other settings (the threshold in `query` must be changed to match `critical`) ... + monitor_thresholds { + warning = 60 + critical = 80 + } +} +``` + +### Adding Additional Monitors + +Add new monitor resources to `monitors.tf` following the existing patterns. + +## Security Considerations + +- Store API keys and app keys securely (use Terraform Cloud, AWS Secrets Manager, or HashiCorp Vault) +- Never commit sensitive credentials to version control +- Use IAM role-based access instead of IAM user credentials where possible +- Review and adjust monitor thresholds based on your workload requirements + +## License + +See project license file. + +## Authors + +Maintained by WebBuildYourCloud team. 
diff --git a/context.tf b/context.tf new file mode 100755 index 0000000..307f711 --- /dev/null +++ b/context.tf @@ -0,0 +1,184 @@ +module "this" { + source = "cloudposse/label/null" + version = "0.24.1" # requires Terraform >= 0.13.0 + + enabled = var.enabled + namespace = var.namespace + environment = var.environment + stage = var.stage + name = var.name + delimiter = var.delimiter + attributes = var.attributes + tags = var.tags + additional_tag_map = var.additional_tag_map + label_order = var.label_order + regex_replace_chars = var.regex_replace_chars + id_length_limit = var.id_length_limit + label_key_case = var.label_key_case + label_value_case = var.label_value_case + + context = var.context +} + +# Copy contents of cloudposse/terraform-null-label/variables.tf here + +variable "context" { + type = any + default = { + enabled = true + namespace = null + environment = null + stage = null + name = null + delimiter = null + attributes = [] + tags = {} + additional_tag_map = {} + regex_replace_chars = null + label_order = [] + id_length_limit = null + label_key_case = null + label_value_case = null + } + description = <<-EOT + Single object for setting entire context at once. + See description of individual variables for details. + Leave string and numeric variables as `null` to use default value. + Individual variable settings (non-null) override settings in context object, + except for attributes, tags, and additional_tag_map, which are merged. + EOT + + validation { + condition = lookup(var.context, "label_key_case", null) == null ? true : contains(["lower", "title", "upper"], var.context["label_key_case"]) + error_message = "Allowed values: `lower`, `title`, `upper`." + } + + validation { + condition = lookup(var.context, "label_value_case", null) == null ? true : contains(["lower", "title", "upper", "none"], var.context["label_value_case"]) + error_message = "Allowed values: `lower`, `title`, `upper`, `none`." 
+ } +} + +variable "enabled" { + type = bool + default = null + description = "Set to false to prevent the module from creating any resources" +} + +variable "namespace" { + type = string + default = null + description = "Namespace, which could be your organization name or abbreviation, e.g. 'eg' or 'cp'" +} + +variable "environment" { + type = string + default = null + description = "Environment, e.g. 'uw2', 'us-west-2', OR 'prod', 'staging', 'dev', 'UAT'" +} + +variable "stage" { + type = string + default = null + description = "Stage, e.g. 'prod', 'staging', 'dev', OR 'source', 'build', 'test', 'deploy', 'release'" +} + +variable "name" { + type = string + default = null + description = "Solution name, e.g. 'app' or 'jenkins'" +} + +variable "delimiter" { + type = string + default = null + description = <<-EOT + Delimiter to be used between `namespace`, `environment`, `stage`, `name` and `attributes`. + Defaults to `-` (hyphen). Set to `""` to use no delimiter at all. + EOT +} + +variable "attributes" { + type = list(string) + default = [] + description = "Additional attributes (e.g. `1`)" +} + +variable "tags" { + type = map(string) + default = {} + description = "Additional tags (e.g. `map('BusinessUnit','XYZ')`" +} + +variable "additional_tag_map" { + type = map(string) + default = {} + description = "Additional tags for appending to tags_as_list_of_maps. Not added to `tags`." +} + +variable "label_order" { + type = list(string) + default = null + description = <<-EOT + The naming order of the id output and Name tag. + Defaults to ["namespace", "environment", "stage", "name", "attributes"]. + You can omit any of the 5 elements, but at least one must be present. + EOT +} + +variable "regex_replace_chars" { + type = string + default = null + description = <<-EOT + Regex to replace chars with empty string in `namespace`, `environment`, `stage` and `name`. + If not set, `"/[^a-zA-Z0-9-]/"` is used to remove all characters other than hyphens, letters and digits. 
+ EOT +} + +variable "id_length_limit" { + type = number + default = null + description = <<-EOT + Limit `id` to this many characters (minimum 6). + Set to `0` for unlimited length. + Set to `null` for default, which is `0`. + Does not affect `id_full`. + EOT + validation { + condition = var.id_length_limit == null ? true : var.id_length_limit >= 6 || var.id_length_limit == 0 + error_message = "The id_length_limit must be >= 6 if supplied (not null), or 0 for unlimited length." + } +} + +variable "label_key_case" { + type = string + default = null + description = <<-EOT + The letter case of label keys (`tag` names) (i.e. `name`, `namespace`, `environment`, `stage`, `attributes`) to use in `tags`. + Possible values: `lower`, `title`, `upper`. + Default value: `title`. + EOT + + validation { + condition = var.label_key_case == null ? true : contains(["lower", "title", "upper"], var.label_key_case) + error_message = "Allowed values: `lower`, `title`, `upper`." + } +} + +variable "label_value_case" { + type = string + default = null + description = <<-EOT + The letter case of output label values (also used in `tags` and `id`). + Possible values: `lower`, `title`, `upper` and `none` (no transformation). + Default value: `lower`. + EOT + + validation { + condition = var.label_value_case == null ? true : contains(["lower", "title", "upper", "none"], var.label_value_case) + error_message = "Allowed values: `lower`, `title`, `upper`, `none`." 
+ } +} +#### End of copy of cloudposse/terraform-null-label/variables.tf + + diff --git a/main.tf b/main.tf new file mode 100755 index 0000000..51b5bf9 --- /dev/null +++ b/main.tf @@ -0,0 +1,9 @@ +module "datadog_integration" { + source = "cloudposse/datadog-integration/aws" + version = "0.11.0" + namespace = "plaap" + stage = "test" + name = "datadog-integration" + integrations = [ "all" ] +} + diff --git a/monitors.tf b/monitors.tf new file mode 100755 index 0000000..7e82a36 --- /dev/null +++ b/monitors.tf @@ -0,0 +1,137 @@ +# Monitor CPU Utilisation +resource "datadog_monitor" "cpumonitor" { + name = "cpu monitor" + type = "metric alert" + message = "CPU usage alert" + query = "avg(last_1m):avg:system.cpu.system{*} by {host} > 60" + monitor_thresholds { + ok = 20 + warning = 50 + critical = 60 + } +} + +# Monitor Memory Utilisation +resource "datadog_monitor" "memorymonitor" { + name = "Usable Memory" + type = "query alert" + evaluation_delay = 15 + include_tags = true + locked = false + message = "Usable Memory is below set threshold, please investigate" + new_host_delay = 300 + no_data_timeframe = 0 + #notify_audit = 0 + #notify_no_data = 0 + priority = 0 + renotify_interval = 0 + require_full_window = true + #restricted_roles = [] + timeout_h = 0 + query = "max(last_5m):avg:system.mem.usable{*} by {host} / avg:system.mem.total{*} by {host} * 100 < 5" + + monitor_thresholds { + critical = 5 + warning = 10 + } +} + +# Monitor System Load +resource "datadog_monitor" "systemload" { + name = "System Load" + type = "query alert" + evaluation_delay = 15 + include_tags = true + locked = false + new_host_delay = 300 + no_data_timeframe = 0 + #notify_audit = false + priority = 0 + renotify_interval = 0 + require_full_window = true + #restricted_roles = [] + tags = [] + timeout_h = 0 + message = "System Load exceeding set threshold, please investigate" + query = "min(last_30m):( avg:system.load.norm.5{*} by {host} ) > 2.5" + + monitor_thresholds { + critical = "2.5" + warning = "2" + } + +} +# 
Monitor Disk Utilisation +resource "datadog_monitor" "disk_usage" { + name = "Disk Space" + type = "query alert" + evaluation_delay = 15 + include_tags = true + locked = false + new_host_delay = 300 + no_data_timeframe = 0 + #notify_audit = 0 + priority = 0 + renotify_interval = 0 + require_full_window = true + #restricted_roles = [] + tags = [] + timeout_h = 0 + message = "Disk Usage is exceeding set threshold, please investigate" + query = "max(last_5m):avg:system.disk.in_use{*} by {host,device} * 100 > 90" + + monitor_thresholds { + critical = "90" + warning = "80" + } +} + +# Monitor Disk Inode Usage +resource "datadog_monitor" "disk_inodes" { + name = "Disk Inodes Usage" + type = "query alert" + evaluation_delay = 15 + include_tags = true + locked = false + new_host_delay = 300 + no_data_timeframe = 0 + #notify_audit = 0 + priority = 0 + renotify_interval = 0 + require_full_window = true + #restricted_roles = [] + tags = [] + timeout_h = 0 + message = "Disk Inode Usage is exceeding set threshold, please investigate" + query = "min(last_5m):avg:system.fs.inodes.in_use{*} by {host,device} * 100 > 95" + + monitor_thresholds { + critical = "95" + warning = "90" + } +} + +# Monitor Disk Space Forecast +resource "datadog_monitor" "disk_forecast" { + name = "Disk Usage Forecast" + type = "query alert" + evaluation_delay = 15 + include_tags = true + locked = false + new_host_delay = 300 + no_data_timeframe = 0 + #notify_audit = 0 + priority = 0 + renotify_interval = 0 + require_full_window = true + #restricted_roles = [] + tags = [] + timeout_h = 0 + message = "Disk Usage is forecast to exceed set threshold within the next week, please investigate" + query = "max(next_1w):forecast(avg:system.disk.in_use{*} by {host,device} * 100, 'linear', 1, interval='60m', history='1w', model='default') >= 80" + + monitor_thresholds { + critical = "80" + warning = "72" + } +} \ No newline at end of file diff --git a/outputs.tf b/outputs.tf new file mode 100755 index 0000000..091d06e --- /dev/null +++ 
b/outputs.tf @@ -0,0 +1,16 @@ +output "aws_account_id" { + value = module.datadog_integration.aws_account_id + description = "AWS Account ID of the IAM Role for Datadog to use for this integration" +} + +output "aws_role_name" { + value = module.datadog_integration.aws_role_name + description = "Name of the AWS IAM Role for Datadog to use for this integration" +} + +output "datadog_external_id" { + value = module.datadog_integration.datadog_external_id + description = "Datadog integration external ID" +} + + diff --git a/variables.tf b/variables.tf new file mode 100755 index 0000000..af04660 --- /dev/null +++ b/variables.tf @@ -0,0 +1,35 @@ +variable "region" { + type = string + description = "The AWS region that the resources to be monitored reside in" +} + +variable "api_key" { + type = string + description = "The api_key that is used to send logs, metrics and traces to the datadog account" +} + +variable "app_key" { + type = string + description = "The app_key that is used to manipulate the datadog API" +} + +variable "datadog_site" { + type = string + description = "Datadog site to connect to (EU or US)" + default = "https://api.datadoghq.eu/" +} + +variable "aws_profile" { + type = string + description = "Which AWS account is this for" +} + +variable "prefix_slug" { + type = string + description = "Prefix slug for naming" +} + +variable "team" { + type = string + description = "Team identifier" +} diff --git a/versions.tf b/versions.tf new file mode 100755 index 0000000..6e26982 --- /dev/null +++ b/versions.tf @@ -0,0 +1,18 @@ +terraform { + required_version = ">= 0.13.0" + + required_providers { + aws = { + source = "hashicorp/aws" + #version = ">= 2.0" + } + local = { + source = "hashicorp/local" + version = ">= 1.3" + } + datadog = { + source = "datadog/datadog" + #version = ">= 2.12" + } + } +}