# Per-host CPU monitor: the query adds the system and user CPU averages and
# alerts when the minimum of that sum over the last 15 minutes is above 80.
# The message notifies var.hno_escalation_group on alert and on recovery.
resource "datadog_monitor" "cpu_80_15min" {
  # Created only when var.linux_basics is set to "enabled".
  count = "${var.linux_basics == "enabled" ? 1 : 0}"

  name    = "CPU High > 80% for 15 min"
  type    = "query alert"
  message = "{{#is_alert}}\n${var.hno_escalation_group}\n{{/is_alert}}\n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}"

  query = "min(last_15m):avg:system.cpu.system{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {host} + avg:system.cpu.user{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {host} > 80"

  # notify_no_data and renotify_interval were each assigned twice in this
  # block (renotify_interval as 60 and then 0). Each attribute is now set
  # exactly once, keeping the later value on the assumption that the last
  # assignment was the effective one.
  # TODO confirm renotify_interval = 0 (rather than 60) is the intended value.
  notify_no_data      = false
  no_data_timeframe   = 20
  renotify_interval   = 0
  notify_audit        = false
  timeout_h           = 0
  include_tags        = true
  locked              = false
  require_full_window = true
  new_host_delay      = 300
}

# Per-host CPU monitor: the query adds the system and user CPU averages and
# alerts when the minimum of that sum over the last 5 minutes is above 95.
# The message notifies var.hno_escalation_group on alert and on recovery.
resource "datadog_monitor" "cpu_95_5min" {
  # Created only when var.linux_basics is set to "enabled".
  count = "${var.linux_basics == "enabled" ? 1 : 0}"

  name    = "CPU High > 95% for 5 min"
  type    = "query alert"
  message = "{{#is_alert}}\n${var.hno_escalation_group}\n{{/is_alert}}\n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}"

  query = "min(last_5m):avg:system.cpu.system{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {host} + avg:system.cpu.user{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {host} > 95"

  # notify_no_data and renotify_interval were each assigned twice in this
  # block (renotify_interval as 60 and then 0). Each attribute is now set
  # exactly once, keeping the later value on the assumption that the last
  # assignment was the effective one.
  # TODO confirm renotify_interval = 0 (rather than 60) is the intended value.
  notify_no_data      = false
  no_data_timeframe   = 20
  renotify_interval   = 0
  notify_audit        = false
  timeout_h           = 0
  include_tags        = true
  locked              = false
  require_full_window = true
  new_host_delay      = 300
}

# Free-disk-space monitor grouped by host and device: the query divides
# system.disk.free by system.disk.total, multiplies by 100 and compares the
# result with 5. The message notifies var.hno_escalation_group on alert and
# on recovery.
resource "datadog_monitor" "datadog_free_disk_space_5" {
  # Created only when var.linux_basics is set to "enabled".
  count = "${var.linux_basics == "enabled" ? 1 : 0}"

  name    = "Free disk space < 5%"
  type    = "query alert"
  message = "{{#is_alert}}\n${var.hno_escalation_group}\n{{/is_alert}}\n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}"

  # NOTE(review): the time aggregator is sum(last_5m), which adds up every
  # point in the window before comparing with the percentage threshold --
  # confirm whether avg/min/max was intended instead.
  # NOTE(review): the scope excludes dd_custom_cpu:enabled although this is a
  # disk monitor -- looks copied from the CPU monitors; verify the tag.
  query = "sum(last_5m):avg:system.disk.free{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {host,device} / avg:system.disk.total{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {host,device} * 100 < 5"

  # notify_no_data and renotify_interval were each assigned twice in this
  # block (renotify_interval as 60 and then 0). Each attribute is now set
  # exactly once, keeping the later value on the assumption that the last
  # assignment was the effective one.
  # TODO confirm renotify_interval = 0 (rather than 60) is the intended value.
  notify_no_data      = false
  no_data_timeframe   = 20
  renotify_interval   = 0
  notify_audit        = false
  timeout_h           = 0
  include_tags        = true
  locked              = false
  require_full_window = true
  new_host_delay      = 300
}

# Free-disk-space monitor grouped by host and device: the query divides
# system.disk.free by system.disk.total, multiplies by 100 and compares the
# result with 10. The message notifies var.hno_escalation_group on alert and
# recovery, and var.ho_escalation_group on warning and warning recovery.
resource "datadog_monitor" "datadog_free_disk_space_10" {
  # Created only when var.linux_basics is set to "enabled".
  count = "${var.linux_basics == "enabled" ? 1 : 0}"

  name    = "Free disk space < 10%"
  type    = "query alert"
  message = "{{#is_alert}}\n${var.hno_escalation_group}\n{{/is_alert}}\n{{#is_recovery}}\n${var.hno_escalation_group}\n{{/is_recovery}}\n{{#is_warning}}\n${var.ho_escalation_group}\n{{/is_warning}}\n{{#is_warning_recovery}}\n${var.ho_escalation_group}\n{{/is_warning_recovery}}"

  # NOTE(review): the time aggregator is sum(last_5m), which adds up every
  # point in the window before comparing with the percentage threshold --
  # confirm whether avg/min/max was intended instead.
  # NOTE(review): the scope excludes dd_custom_cpu:enabled although this is a
  # disk monitor -- looks copied from the CPU monitors; verify the tag.
  query = "sum(last_5m):avg:system.disk.free{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {host,device} / avg:system.disk.total{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {host,device} * 100 < 10"

  # The critical value matches the threshold written in the query (10);
  # the warning level is 20.
  thresholds {
    warning  = 20
    critical = 10
  }

  # notify_no_data and renotify_interval were each assigned twice in this
  # block (renotify_interval as 60 and then 0). Each attribute is now set
  # exactly once, keeping the later value on the assumption that the last
  # assignment was the effective one.
  # TODO confirm renotify_interval = 0 (rather than 60) is the intended value.
  notify_no_data      = false
  no_data_timeframe   = 20
  renotify_interval   = 0
  notify_audit        = false
  timeout_h           = 0
  include_tags        = true
  locked              = false
  require_full_window = true
  new_host_delay      = 300
}

# Free-inode monitor grouped by host and device: the query divides
# system.fs.inodes.free by system.fs.inodes.total, multiplies by 100 and
# compares the result with 5. The message notifies var.hno_escalation_group
# on alert and on recovery.
resource "datadog_monitor" "datadog_free_disk_space_inodes_5" {
  # Created only when var.linux_basics is set to "enabled".
  count = "${var.linux_basics == "enabled" ? 1 : 0}"

  name    = "Free disk inodes < 5%"
  type    = "query alert"
  message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group} \n{{/is_recovery}}"

  # NOTE(review): the time aggregator is sum(last_5m), which adds up every
  # point in the window before comparing with the percentage threshold --
  # confirm whether avg/min/max was intended instead.
  # NOTE(review): the scope excludes dd_custom_cpu:enabled although this is an
  # inode monitor -- looks copied from the CPU monitors; verify the tag.
  query = "sum(last_5m):avg:system.fs.inodes.free{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {host,device} / avg:system.fs.inodes.total{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {host,device} * 100 < 5"

  # notify_no_data and renotify_interval were each assigned twice in this
  # block (renotify_interval as 60 and then 0). Each attribute is now set
  # exactly once, keeping the later value on the assumption that the last
  # assignment was the effective one.
  # TODO confirm renotify_interval = 0 (rather than 60) is the intended value.
  notify_no_data      = false
  no_data_timeframe   = 20
  renotify_interval   = 0
  notify_audit        = false
  timeout_h           = 0
  include_tags        = true
  locked              = false
  require_full_window = true
  new_host_delay      = 300
}

# Free-inode monitor grouped by host and device: the query takes the maximum
# over the last 5 minutes of system.fs.inodes.free divided by
# system.fs.inodes.total, times 100, and compares it with 10. The message
# notifies var.hno_escalation_group on alert and on recovery.
resource "datadog_monitor" "datadog_free_disk_space_inodes_10" {
  # Created only when var.linux_basics is set to "enabled".
  count = "${var.linux_basics == "enabled" ? 1 : 0}"

  name    = "Free disk inodes < 10%"
  type    = "query alert"
  # NOTE(review): a warning threshold is configured below, but this message
  # has no is_warning / is_warning_recovery section -- confirm that warning
  # notifications are meant to go out without an escalation group.
  message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group} \n{{/is_recovery}}"

  # NOTE(review): the scope excludes dd_custom_cpu:enabled although this is an
  # inode monitor -- looks copied from the CPU monitors; verify the tag.
  query = "max(last_5m):avg:system.fs.inodes.free{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {host,device} / avg:system.fs.inodes.total{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {host,device} * 100 < 10"

  # The critical value matches the threshold written in the query (10);
  # the warning level is 20.
  thresholds {
    warning  = 20
    critical = 10
  }

  # notify_no_data and renotify_interval were each assigned twice in this
  # block (renotify_interval as 60 and then 0). Each attribute is now set
  # exactly once, keeping the later value on the assumption that the last
  # assignment was the effective one.
  # TODO confirm renotify_interval = 0 (rather than 60) is the intended value.
  notify_no_data      = false
  no_data_timeframe   = 20
  renotify_interval   = 0
  notify_audit        = false
  timeout_h           = 0
  include_tags        = true
  locked              = false
  require_full_window = true
  new_host_delay      = 300
}

# Load monitor grouped by instance-id: the query divides the 5-minute load
# average (system.load.5) by gcp.gce.instance.cpu.reserved_cores and alerts
# when the minimum of that ratio over the last 5 minutes is above 2.
# The message is a fixed text and references no escalation group.
resource "datadog_monitor" "datadog_cpu_load" {
  # Created only when var.linux_basics is set to "enabled".
  count = "${var.linux_basics == "enabled" ? 1 : 0}"

  name    = "CPU Load > 2"
  type    = "query alert"
  message = "Debugging alert - no escalation"

  # The numerator is filtered by the dd_* tags; the reserved_cores
  # denominator is queried over all scopes ({*}).
  query = "min(last_5m):avg:system.load.5{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled} by {instance-id} / avg:gcp.gce.instance.cpu.reserved_cores{*} by {instance-id} > 2"

  # notify_no_data and renotify_interval were each assigned twice in this
  # block (renotify_interval as 60 and then 0). Each attribute is now set
  # exactly once, keeping the later value on the assumption that the last
  # assignment was the effective one.
  # TODO confirm renotify_interval = 0 (rather than 60) is the intended value.
  notify_no_data      = false
  no_data_timeframe   = 20
  renotify_interval   = 0
  notify_audit        = false
  timeout_h           = 0
  include_tags        = true
  locked              = false
  require_full_window = true
  new_host_delay      = 300
}

# Free-memory monitor grouped by host: the query divides system.mem.free by
# system.mem.total, multiplies by 100 and compares the result with 5.
# The message is a fixed text and references no escalation group.
resource "datadog_monitor" "datadog_free_memory" {
  # Created only when var.linux_basics is set to "enabled".
  count = "${var.linux_basics == "enabled" ? 1 : 0}"

  name    = "Free memory < 5%"
  type    = "query alert"
  message = "Debugging alert - no escalation"

  # NOTE(review): the time aggregator is sum(last_1m), which adds up every
  # point in the window before comparing with the percentage threshold --
  # confirm whether avg/min/max was intended instead.
  # The numerator is filtered by the dd_* tags (excluding
  # dd_custom_memory:enabled); the system.mem.total denominator uses {*}.
  query = "sum(last_1m):avg:system.mem.free{dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_memory:enabled} by {host} / avg:system.mem.total{*} by {host} * 100 < 5"

  # notify_no_data and renotify_interval were each assigned twice in this
  # block (renotify_interval as 60 and then 0). Each attribute is now set
  # exactly once, keeping the later value on the assumption that the last
  # assignment was the effective one.
  # TODO confirm renotify_interval = 0 (rather than 60) is the intended value.
  notify_no_data      = false
  no_data_timeframe   = 20
  renotify_interval   = 0
  notify_audit        = false
  timeout_h           = 0
  include_tags        = true
  locked              = false
  require_full_window = true
  new_host_delay      = 300
}

# Service-check monitor on "datadog.agent.up" for hosts matching the dd_*
# tag scope. The message notifies var.hno_escalation_group on alert and on
# recovery.
resource "datadog_monitor" "datadog_host_unreachable" {
  # Created only when var.linux_basics is set to "enabled".
  count = "${var.linux_basics == "enabled" ? 1 : 0}"

  name    = "Host unreachable"
  type    = "service check"
  message = "{{#is_alert}}\n${var.hno_escalation_group} \n{{/is_alert}} \n{{#is_recovery}}\n${var.hno_escalation_group} \n{{/is_recovery}}"

  # NOTE(review): the scope excludes dd_custom_cpu:enabled although this is a
  # reachability check -- looks copied from the CPU monitors; verify the tag.
  query = "\"datadog.agent.up\".over(\"dd_monitoring:enabled,dd_linux_basics:enabled,!dd_custom_cpu:enabled\").last(1).count_by_status()"

  # notify_no_data was assigned twice in this block (false, then true) and
  # renotify_interval twice (60, then 0). Each attribute is now set exactly
  # once, keeping the later value on the assumption that the last assignment
  # was the effective one.
  # TODO confirm notify_no_data = true and renotify_interval = 0 are intended.
  notify_no_data      = true
  no_data_timeframe   = 20
  renotify_interval   = 0
  notify_audit        = false
  timeout_h           = 0
  include_tags        = true
  locked              = false
  require_full_window = true
  new_host_delay      = 300
}