maj light version

light
greglebreton 1 year ago
parent ab0c420619
commit b728e9272e
  1. 37
      README.md
  2. 172
      alertmanager/alert.rules
  3. 172
      alertmanager/alert.rules.BAK
  4. 39
      alertmanager/alertmanager.yml
  5. 11
      alertmanager/config.yml
  6. 37
      alertmanager/custom-alert.yml
  7. 16
      docker-compose-telegraf-agent.yml
  8. 72
      docker-compose.yml
  9. 53
      prometheus/prometheus.yml

@ -5,24 +5,23 @@
## Config
- Grafana: grafana/provisioning/datasources/influxdb.yml (password for influxdb)
- InfluxDB:
- Telegraf: telegraf/telegraf.conf (line 63)
### Production use
![PUSE](docs/production.png)
- Telegraf config: telegraf.conf (line 63)
urls = ["http://SERVER-IP:8086"] # required
- InfluxDB: influxdb/config/influxdb.conf
- Telegraf: telegraf/telegraf.conf (line 63 + 74 & 75)
## Usage
### Main monitor
- get docker UID for telegraf (agent only)
```bash
./get-docker-id.sh
```
- credentials
```bash
nano .env
```
- start stack
```bash
docker-compose up -d
@ -33,13 +32,27 @@ docker-compose up -d
docker-compose logs -f
```
### Agent
- config agent
```bash
nano .env
mv docker-compose-telegraf-agent.yml docker-compose.yml
```
- start agent
```bash
docker-compose up -d
```
## Dashboards
- AMD64:
- ARM64:
- [Raspberry host monitoring](https://grafana.com/grafana/dashboards/10578-raspberry-pi-monitoring/)
- [Docker monitoring](https://grafana.com/grafana/dashboards/10585-docker-dashboard/)
---------------------
## Services and Ports
## Services and Ports exemple
### Grafana

@ -1,172 +0,0 @@
groups:
- name: targets
rules:
- alert: monitor_service_down
expr: up == 0
for: 30s
labels:
severity: critical
annotations:
summary: "Monitor service non-operational"
description: "Service {{ $labels.instance }} is down."
## FOR HOST ##################################################################
- name: host
rules:
- alert: HostHighCpuLoad
expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 80
for: 0m
labels:
severity: warning
annotations:
summary: Host high CPU load (instance {{ $labels.instance }})
description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: high_memory_load
expr: (sum(node_memory_MemTotal_bytes) - sum(node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes) ) / sum(node_memory_MemTotal_bytes) * 100 > 85
for: 30s
labels:
severity: warning
annotations:
summary: "Server memory is almost full"
description: "Docker host memory usage is {{ humanize $value}}%. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}."
- alert: HostPhysicalComponentTooHot
expr: node_hwmon_temp_celsius > 75
for: 5m
labels:
severity: warning
annotations:
summary: Host physical component too hot (instance {{ $labels.instance }})
description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostSwapIsFillingUp
expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80
for: 2m
labels:
severity: warning
annotations:
summary: Host swap is filling up (instance {{ $labels.instance }})
description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: high_storage_load
expr: (node_filesystem_size_bytes{fstype="aufs"} - node_filesystem_free_bytes{fstype="aufs"}) / node_filesystem_size_bytes{fstype="aufs"} * 100 > 85
for: 30s
labels:
severity: warning
annotations:
summary: "Server storage is almost full"
description: "Docker host storage usage is {{ humanize $value}}%. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}."
- alert: HostOutOfMemory
expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
for: 2m
labels:
severity: warning
annotations:
summary: Host out of memory (instance {{ $labels.instance }})
description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualNetworkThroughputIn
expr: sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100
for: 5m
labels:
severity: warning
annotations:
summary: Host unusual network throughput in (instance {{ $labels.instance }})
description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualNetworkThroughputOut
expr: sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100
for: 5m
labels:
severity: warning
annotations:
summary: Host unusual network throughput out (instance {{ $labels.instance }})
description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostOutOfDiskSpace
expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
for: 2m
labels:
severity: warning
annotations:
summary: Host out of disk space (instance {{ $labels.instance }})
description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
## FOR RAID ##########################################################
- alert: HostRaidArrayGotInactive
expr: node_md_state{state="inactive"} > 0
for: 0m
labels:
severity: critical
annotations:
summary: Host RAID array got inactive (instance {{ $labels.instance }})
description: "RAID array {{ $labels.device }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostRaidDiskFailure
expr: node_md_disks{state="failed"} > 0
for: 2m
labels:
severity: warning
annotations:
summary: Host RAID disk failure (instance {{ $labels.instance }})
description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
## FOR CONTAINERS #####################################################
- name: containers
rules:
- alert: nextcloud_down
expr: absent(container_memory_usage_bytes{name="jenkins"})
for: 30s
labels:
severity: critical
annotations:
summary: "Nextcloud down"
description: "Nextcloud container is down for more than 30 seconds."
- alert: ContainerCpuUsage
expr: (sum(rate(container_cpu_usage_seconds_total{name!=""}[3m])) BY (instance, name) * 100) > 80
for: 2m
labels:
severity: warning
annotations:
summary: Container CPU usage (instance {{ $labels.instance }})
description: "Container CPU usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ContainerMemoryUsage
expr: (sum(container_memory_working_set_bytes{name!=""}) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) > 80
for: 2m
labels:
severity: warning
annotations:
summary: Container Memory usage (instance {{ $labels.instance }})
description: "Container Memory usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
## FOR NGINX ##########################################################
- name: nginx
rules:
- alert: NginxHighHttp4xxErrorRate
expr: sum(rate(nginx_http_requests_total{status=~"^4.."}[1m])) / sum(rate(nginx_http_requests_total[1m])) * 100 > 5
for: 1m
labels:
severity: critical
annotations:
summary: Nginx high HTTP 4xx error rate (instance {{ $labels.instance }})
description: "Too many HTTP requests with status 4xx (> 5%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NginxHighHttp5xxErrorRate
expr: sum(rate(nginx_http_requests_total{status=~"^5.."}[1m])) / sum(rate(nginx_http_requests_total[1m])) * 100 > 5
for: 1m
labels:
severity: critical
annotations:
summary: Nginx high HTTP 5xx error rate (instance {{ $labels.instance }})
description: "Too many HTTP requests with status 5xx (> 5%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

@ -1,172 +0,0 @@
groups:
- name: targets
rules:
- alert: monitor_service_down
expr: up == 0
for: 30s
labels:
severity: critical
annotations:
summary: "Monitor service non-operational"
description: "Service {{ $labels.instance }} is down."
## FOR HOST ##################################################################
- name: host
rules:
- alert: HostHighCpuLoad
expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 80
for: 0m
labels:
severity: warning
annotations:
summary: Host high CPU load (instance {{ $labels.instance }})
description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: high_memory_load
expr: (sum(node_memory_MemTotal_bytes) - sum(node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes) ) / sum(node_memory_MemTotal_bytes) * 100 > 85
for: 30s
labels:
severity: warning
annotations:
summary: "Server memory is almost full"
description: "Docker host memory usage is {{ humanize $value}}%. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}."
- alert: HostPhysicalComponentTooHot
expr: node_hwmon_temp_celsius > 75
for: 5m
labels:
severity: warning
annotations:
summary: Host physical component too hot (instance {{ $labels.instance }})
description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostSwapIsFillingUp
expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80
for: 2m
labels:
severity: warning
annotations:
summary: Host swap is filling up (instance {{ $labels.instance }})
description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: high_storage_load
expr: (node_filesystem_size_bytes{fstype="aufs"} - node_filesystem_free_bytes{fstype="aufs"}) / node_filesystem_size_bytes{fstype="aufs"} * 100 > 85
for: 30s
labels:
severity: warning
annotations:
summary: "Server storage is almost full"
description: "Docker host storage usage is {{ humanize $value}}%. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}."
- alert: HostOutOfMemory
expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
for: 2m
labels:
severity: warning
annotations:
summary: Host out of memory (instance {{ $labels.instance }})
description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualNetworkThroughputIn
expr: sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100
for: 5m
labels:
severity: warning
annotations:
summary: Host unusual network throughput in (instance {{ $labels.instance }})
description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualNetworkThroughputOut
expr: sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100
for: 5m
labels:
severity: warning
annotations:
summary: Host unusual network throughput out (instance {{ $labels.instance }})
description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostOutOfDiskSpace
expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
for: 2m
labels:
severity: warning
annotations:
summary: Host out of disk space (instance {{ $labels.instance }})
description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
## FOR RAID ##########################################################
- alert: HostRaidArrayGotInactive
expr: node_md_state{state="inactive"} > 0
for: 0m
labels:
severity: critical
annotations:
summary: Host RAID array got inactive (instance {{ $labels.instance }})
description: "RAID array {{ $labels.device }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostRaidDiskFailure
expr: node_md_disks{state="failed"} > 0
for: 2m
labels:
severity: warning
annotations:
summary: Host RAID disk failure (instance {{ $labels.instance }})
description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
## FOR CONTAINERS #####################################################
- name: containers
rules:
- alert: nextcloud_down
expr: absent(container_memory_usage_bytes{name="jenkins"})
for: 30s
labels:
severity: critical
annotations:
summary: "Nextcloud down"
description: "Nextcloud container is down for more than 30 seconds."
- alert: ContainerCpuUsage
expr: (sum(rate(container_cpu_usage_seconds_total{name!=""}[3m])) BY (instance, name) * 100) > 80
for: 2m
labels:
severity: warning
annotations:
summary: Container CPU usage (instance {{ $labels.instance }})
description: "Container CPU usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ContainerMemoryUsage
expr: (sum(container_memory_working_set_bytes{name!=""}) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) > 80
for: 2m
labels:
severity: warning
annotations:
summary: Container Memory usage (instance {{ $labels.instance }})
description: "Container Memory usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
## FOR NGINX ##########################################################
- name: nginx
rules:
- alert: NginxHighHttp4xxErrorRate
expr: sum(rate(nginx_http_requests_total{status=~"^4.."}[1m])) / sum(rate(nginx_http_requests_total[1m])) * 100 > 5
for: 1m
labels:
severity: critical
annotations:
summary: Nginx high HTTP 4xx error rate (instance {{ $labels.instance }})
description: "Too many HTTP requests with status 4xx (> 5%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NginxHighHttp5xxErrorRate
expr: sum(rate(nginx_http_requests_total{status=~"^5.."}[1m])) / sum(rate(nginx_http_requests_total[1m])) * 100 > 5
for: 1m
labels:
severity: critical
annotations:
summary: Nginx high HTTP 5xx error rate (instance {{ $labels.instance }})
description: "Too many HTTP requests with status 5xx (> 5%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

@ -1,39 +0,0 @@
global:
resolve_timeout: 5m
route:
group_by: ['alertname']
group_wait: 10s
group_interval: 5m
repeat_interval: 10m
receiver: 'email'
receivers:
- name: 'email'
email_configs:
- to: 'mail1@mail.com, mail2@mail.com'
from: ''
smarthost:
auth_username: ''
auth_identity: ''
auth_password: ''
require_tls: yes
send_resolved: true
# mute_time_intervals:
# - name: out-of-business-hours
# time_intervals:
# - weekdays: ['Saturday','Sunday']
# - times:
# - start_time: '00:00'
# end_time: '08:00'
# - start_time: '18:00'
# end_time: '24:00'
inhibit_rules:
- source_match:
severity: 'critical'
target_match:
severity: 'warning'
equal: ['alertname', 'dev', 'instance']

@ -1,11 +0,0 @@
route:
receiver: 'slack'
receivers:
- name: 'slack'
slack_configs:
- send_resolved: true
text: "{{ .CommonAnnotations.description }}"
username: 'Prometheus'
channel: '#prometheus'
api_url: 'https://hooks.slack.com/services/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'

@ -1,37 +0,0 @@
# Whether to notify about resolved alerts.
[ send_resolved: <boolean> | default = false ]
# The email address to send notifications to.
to: <tmpl_string>
# The sender's address.
[ from: <tmpl_string> | default = global.smtp_from ]
# The SMTP host through which emails are sent.
[ smarthost: <string> | default = global.smtp_smarthost ]
# The hostname to identify to the SMTP server.
[ hello: <string> | default = global.smtp_hello ]
# SMTP authentication information.
[ auth_username: <string> | default = global.smtp_auth_username ]
[ auth_password: <secret> | default = global.smtp_auth_password ]
[ auth_secret: <secret> | default = global.smtp_auth_secret ]
[ auth_identity: <string> | default = global.smtp_auth_identity ]
# The SMTP TLS requirement.
# Note that Go does not support unencrypted connections to remote SMTP endpoints.
[ require_tls: <bool> | default = global.smtp_require_tls ]
# TLS configuration.
tls_config:
[ <tls_config> ]
# The HTML body of the email notification.
[ html: <tmpl_string> | default = '{{ template "email.default.html" . }}' ]
# The text body of the email notification.
[ text: <tmpl_string> ]
# Further headers email header key/value pairs. Overrides any headers
# previously set by the notification implementation.
[ headers: { <string>: <tmpl_string>, ... } ]

@ -14,18 +14,4 @@ services:
- ./telegraf/telegraf.conf:/etc/telegraf/telegraf.conf:ro
- /var/run/docker.sock:/var/run/docker.sock
ports:
- '8125:8125/udp'
# cadvisor:
# image: gcr.io/cadvisor/cadvisor
# # + image arm64
# container_name: cadvisor
# volumes:
# - /:/rootfs:ro
# - /var/run:/var/run:rw
# - /sys:/sys:ro
# - /var/lib/docker:/var/lib/docker:ro
# - /cgroup:/cgroup:ro
# restart: unless-stopped
# ports:
# - '8080:8080'
- '8125:8125/udp'

@ -7,17 +7,10 @@ services:
container_name: influxdb
restart: unless-stopped
env_file: .env
# environment:
# - INFLUXDB_DB=
# - INFLUXDB_USER=
# - INFLUXDB_ADMIN_ENABLED=
# - INFLUXDB_ADMIN_USER=
# - INFLUXDB_ADMIN_PASSWORD=
ports:
- '8086:8086'
volumes:
- influxdb_data:/var/lib/influxdb
- ./influxdb_data:/var/lib/influxdb
- ./influxdb/imports:/imports
- ./influxdb/config:/etc/influxdb/
@ -34,7 +27,7 @@ services:
ports:
- '3000:3000'
volumes:
- grafana_data:/var/lib/grafana
- ./grafana_data:/var/lib/grafana
- ./grafana/provisioning/:/etc/grafana/provisioning/
- ./grafana/dashboards/:/var/lib/grafana/dashboards/
@ -49,7 +42,7 @@ services:
hostname: ${HOSTNAME}
user: "telegraf:${DOCKER_GID}"
volumes:
- ./telegraf/telegraf.conf:/etc/telegraf/telegraf.conf:ro
- ./telegraf_data/telegraf.conf:/etc/telegraf/telegraf.conf:ro
- /var/run/docker.sock:/var/run/docker.sock
depends_on:
- influxdb
@ -57,62 +50,3 @@ services:
- influxdb
ports:
- '8125:8125/udp'
# # containers metrics
# cadvisor:
# image: gcr.io/cadvisor/cadvisor
# # + image arm64
# container_name: cadvisor
# restart: unless-stopped
# volumes:
# - /:/rootfs:ro
# - /var/run:/var/run:rw
# - /sys:/sys:ro
# - /var/lib/docker:/var/lib/docker:ro
# - /cgroup:/cgroup:ro
# ports:
# - '8080:8080'
# gather metrics and launch alerts
prometheus:
image: prom/prometheus:v2.17.1
container_name: prometheus
restart: unless-stopped
volumes:
- ./prometheus:/etc/prometheus
- prometheus_data:/prometheus
command:
- '--config.file=/etc/prometheus/prometheus.yml'
- '--storage.tsdb.path=/prometheus'
- '--web.console.libraries=/etc/prometheus/console_libraries'
- '--web.console.templates=/etc/prometheus/consoles'
- '--storage.tsdb.retention.time=200h'
- '--web.enable-lifecycle'
ports:
- '9090:9090'
# alert manager!
alertmanager:
image: prom/alertmanager:v0.20.0
container_name: alertmanager
restart: unless-stopped
volumes:
- ./alertmanager:/etc/alertmanager
command:
- '--config.file=/etc/alertmanager/alertmanager.yml'
- '--storage.path=/alertmanager'
ports:
- '9093:9093'
pushgateway:
image: prom/pushgateway:v1.2.0
container_name: pushgateway
restart: unless-stopped
ports:
- "9091:9091"
volumes:
grafana_data: {}
influxdb_data: {}
prometheus_data: {}

@ -1,53 +0,0 @@
global:
scrape_interval: 15s
evaluation_interval: 15s
# Attach these labels to any time series or alerts when communicating with
# external systems (federation, remote storage, Alertmanager).
external_labels:
monitor: 'docker-host-alpha'
# Load and evaluate rules in this file every 'evaluation_interval' seconds.
rule_files:
- "alert.rules"
# A scrape configuration containing exactly one endpoint to scrape.
scrape_configs:
- job_name: 'nodeexporter'
scrape_interval: 5s
static_configs:
- targets: ['nodeexporter:9100']
- job_name: 'cadvisor'
scrape_interval: 5s
static_configs:
- targets: ['cadvisor:8080']
- job_name: 'prometheus'
scrape_interval: 10s
static_configs:
- targets: ['localhost:9090']
- job_name: 'pushgateway'
scrape_interval: 10s
honor_labels: true
static_configs:
- targets: ['pushgateway:9091']
alerting:
alertmanagers:
- scheme: http
static_configs:
- targets:
- 'alertmanager:9093'
# - job_name: 'nginx'
# scrape_interval: 10s
# static_configs:
# - targets: ['nginxexporter:9113']
# - job_name: 'aspnetcore'
# scrape_interval: 10s
# static_configs:
# - targets: ['eventlog-proxy:5000', 'eventlog:5000']
Loading…
Cancel
Save