# You cannot select more than 25 topics
# Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
# 172 lines
# 6.8 KiB
groups:
# Group "targets": a single rule built on the `up` series.
- name: targets
  rules:
  # Fires with severity "critical" once `up == 0` has held for 30 seconds;
  # the description names the affected series via its `instance` label.
  - alert: monitor_service_down
    expr: up == 0
    for: 30s
    labels:
      severity: critical
    annotations:
      summary: 'Monitor service non-operational'
      description: 'Service {{ $labels.instance }} is down.'

## FOR HOST ##################################################################

# Group "host": host-level rules over node_* metrics (CPU, memory, swap,
# temperature, filesystem, network) followed by the software-RAID rules.
- name: host
  rules:
  # Non-idle CPU percentage per instance, from the 2m rate of the idle
  # counter. `for: 0m` means no pending period before the alert fires.
  - alert: HostHighCpuLoad
    expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 80
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: Host high CPU load (instance {{ $labels.instance }})
      description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

  # Used memory (total minus free, buffers and cached) as a percentage of total.
  # The aggregation keeps `instance` and `job` because the description renders
  # both labels; a bare sum() would discard them and leave those fields empty.
  - alert: high_memory_load
    expr: (sum by (instance, job) (node_memory_MemTotal_bytes) - sum by (instance, job) (node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes)) / sum by (instance, job) (node_memory_MemTotal_bytes) * 100 > 85
    for: 30s
    labels:
      severity: warning
    annotations:
      summary: "Server memory is almost full"
      description: "Docker host memory usage is {{ humanize $value}}%. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}."

  # Any hwmon temperature reading above 75 for 5 minutes.
  - alert: HostPhysicalComponentTooHot
    expr: node_hwmon_temp_celsius > 75
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: Host physical component too hot (instance {{ $labels.instance }})
      description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

  # Percentage of swap in use (1 - free/total) above 80 for 2 minutes.
  - alert: HostSwapIsFillingUp
    expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host swap is filling up (instance {{ $labels.instance }})
      description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

  # Used percentage of filesystems whose fstype label is "aufs".
  # NOTE(review): this only matches aufs-backed filesystems; on hosts using a
  # different storage driver the selector returns nothing and the rule can
  # never fire - confirm the fstype in use.
  - alert: high_storage_load
    expr: (node_filesystem_size_bytes{fstype="aufs"} - node_filesystem_free_bytes{fstype="aufs"}) / node_filesystem_size_bytes{fstype="aufs"} * 100 > 85
    for: 30s
    labels:
      severity: warning
    annotations:
      summary: "Server storage is almost full"
      description: "Docker host storage usage is {{ humanize $value}}%. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}."

  # Available memory below 10% of total for 2 minutes.
  - alert: HostOutOfMemory
    expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host out of memory (instance {{ $labels.instance }})
      description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

  # Per-instance receive rate; two divisions by 1024 convert bytes/s to MiB/s.
  - alert: HostUnusualNetworkThroughputIn
    expr: sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: Host unusual network throughput in (instance {{ $labels.instance }})
      description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

  # Per-instance transmit rate, same unit conversion as the inbound rule.
  - alert: HostUnusualNetworkThroughputOut
    expr: sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: Host unusual network throughput out (instance {{ $labels.instance }})
      description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

  # Less than 10% space available, restricted to filesystems whose
  # node_filesystem_readonly value is 0 for the same instance/device/mountpoint.
  - alert: HostOutOfDiskSpace
    expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host out of disk space (instance {{ $labels.instance }})
      description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

  ## FOR RAID ##########################################################

  # An md array reporting state="inactive"; no pending period (`for: 0m`).
  - alert: HostRaidArrayGotInactive
    expr: node_md_state{state="inactive"} > 0
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: Host RAID array got inactive (instance {{ $labels.instance }})
      description: "RAID array {{ $labels.device }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

  # At least one failed disk in an md array for 2 minutes.
  # The array is named via the `device` label, matching the other node_md_*
  # rule in this group (the previous `md_device` reference rendered empty).
  # NOTE(review): confirm `device` is the label your node_exporter emits.
  - alert: HostRaidDiskFailure
    expr: node_md_disks{state="failed"} > 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host RAID disk failure (instance {{ $labels.instance }})
      description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.device }} needs attention and possibly a disk swap\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

## FOR CONTAINERS #####################################################

# Group "containers": per-container rules over container_* metrics.
- name: containers
  rules:
  # Fires when no container_memory_usage_bytes series exists for the
  # Nextcloud container for 30 seconds. The selector previously pointed at
  # name="jenkins", which contradicted the alert name and annotations.
  # NOTE(review): confirm the container is actually named "nextcloud".
  - alert: nextcloud_down
    expr: absent(container_memory_usage_bytes{name="nextcloud"})
    for: 30s
    labels:
      severity: critical
    annotations:
      summary: "Nextcloud down"
      description: "Nextcloud container is down for more than 30 seconds."

  # 3m CPU usage rate per (instance, name), scaled to a percentage; series
  # with an empty `name` label are excluded.
  - alert: ContainerCpuUsage
    expr: (sum(rate(container_cpu_usage_seconds_total{name!=""}[3m])) BY (instance, name) * 100) > 80
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Container CPU usage (instance {{ $labels.instance }})
      description: "Container CPU usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

  # Working-set bytes as a percentage of the memory limit per (instance, name);
  # the `> 0` filter drops limit series whose value is 0.
  - alert: ContainerMemoryUsage
    expr: (sum(container_memory_working_set_bytes{name!=""}) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) > 80
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Container Memory usage (instance {{ $labels.instance }})
      description: "Container Memory usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

## FOR NGINX ##########################################################

# Group "nginx": HTTP error-ratio rules over nginx_http_requests_total.
# Both rules aggregate `by (instance)` so that the `instance` label used in
# the summaries survives the aggregation; a bare sum() removes every label
# and the summary would render an empty instance.
- name: nginx
  rules:
  # Share of requests with a 4xx status over the last minute, above 5%.
  - alert: NginxHighHttp4xxErrorRate
    expr: sum by (instance) (rate(nginx_http_requests_total{status=~"^4.."}[1m])) / sum by (instance) (rate(nginx_http_requests_total[1m])) * 100 > 5
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: Nginx high HTTP 4xx error rate (instance {{ $labels.instance }})
      description: "Too many HTTP requests with status 4xx (> 5%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

  # Share of requests with a 5xx status over the last minute, above 5%.
  - alert: NginxHighHttp5xxErrorRate
    expr: sum by (instance) (rate(nginx_http_requests_total{status=~"^5.."}[1m])) / sum by (instance) (rate(nginx_http_requests_total[1m])) * 100 > 5
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: Nginx high HTTP 5xx error rate (instance {{ $labels.instance }})
      description: "Too many HTTP requests with status 5xx (> 5%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"