From 924c6f0b99e960a36ffe7ca2da01d1af5548ce1c Mon Sep 17 00:00:00 2001
From: greglebreton
Date: Wed, 9 Nov 2022 14:41:41 +0100
Subject: [PATCH] push

---
 .env                          |   2 +
 README.md                     |  22 +++++
 alertmanager/alert.rules      | 172 ++++++++++++++++++++++++++++++++++
 alertmanager/alertmanager.yml |  39 ++++++++
 alertmanager/config.yml       |  11 +++
 alertmanager/custom-alert.yml |  37 ++++++++
 docker-compose.yml            | 135 ++++++++++++++++++++++++++
 prometheus/prometheus.yml     |  53 +++++++++++
 8 files changed, 471 insertions(+)
 create mode 100644 .env
 create mode 100644 README.md
 create mode 100644 alertmanager/alert.rules
 create mode 100644 alertmanager/alertmanager.yml
 create mode 100644 alertmanager/config.yml
 create mode 100644 alertmanager/custom-alert.yml
 create mode 100644 docker-compose.yml
 create mode 100644 prometheus/prometheus.yml

diff --git a/.env b/.env
new file mode 100644
index 0000000..8f82ec8
--- /dev/null
+++ b/.env
@@ -0,0 +1,2 @@
+ADMIN_USER=admin
+ADMIN_PASSWORD=password
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..293409a
--- /dev/null
+++ b/README.md
@@ -0,0 +1,22 @@
+# PROMETHEUS & GRAFANA
+
+This project monitors a server with a Docker stack.
+
+## CONFIGURATION
+
+- Configure the access credentials used by Caddy:
+```bash
+nano .env
+```
+
+- Configure the mailbox used for alert notifications:
+```bash
+nano alertmanager/alertmanager.yml
+```
+
+- Configure the alerting rules:
+```bash
+nano alertmanager/alert.rules
+```
+
+> Grafana is available at: http://:3000
\ No newline at end of file
diff --git a/alertmanager/alert.rules b/alertmanager/alert.rules
new file mode 100644
index 0000000..0dc75f7
--- /dev/null
+++ b/alertmanager/alert.rules
@@ -0,0 +1,172 @@
+groups:
+- name: targets
+  rules:
+  - alert: monitor_service_down
+    expr: up == 0
+    for: 30s
+    labels:
+      severity: critical
+    annotations:
+      summary: "Monitor service non-operational"
+      description: "Service {{ $labels.instance }} is down."
+
+## FOR HOST ##################################################################
+
+- name: host
+  rules:
+  - alert: HostHighCpuLoad
+    expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 80
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host high CPU load (instance {{ $labels.instance }})
+      description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+  - alert: high_memory_load
+    expr: (sum(node_memory_MemTotal_bytes) - sum(node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes) ) / sum(node_memory_MemTotal_bytes) * 100 > 85
+    for: 30s
+    labels:
+      severity: warning
+    annotations:
+      summary: "Server memory is almost full"
+      description: "Docker host memory usage is {{ humanize $value}}%. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}."
+
+  - alert: HostPhysicalComponentTooHot
+    expr: node_hwmon_temp_celsius > 75
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host physical component too hot (instance {{ $labels.instance }})
+      description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+  - alert: HostSwapIsFillingUp
+    expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host swap is filling up (instance {{ $labels.instance }})
+      description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+  - alert: high_storage_load
+    expr: (node_filesystem_size_bytes{fstype="aufs"} - node_filesystem_free_bytes{fstype="aufs"}) / node_filesystem_size_bytes{fstype="aufs"} * 100 > 85
+    for: 30s
+    labels:
+      severity: warning
+    annotations:
+      summary: "Server storage is almost full"
+      description: "Docker host storage usage is {{ humanize $value}}%. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}."
+
+  - alert: HostOutOfMemory
+    expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host out of memory (instance {{ $labels.instance }})
+      description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+  - alert: HostUnusualNetworkThroughputIn
+    expr: sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host unusual network throughput in (instance {{ $labels.instance }})
+      description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+  - alert: HostUnusualNetworkThroughputOut
+    expr: sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host unusual network throughput out (instance {{ $labels.instance }})
+      description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+  - alert: HostOutOfDiskSpace
+    expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host out of disk space (instance {{ $labels.instance }})
+      description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+## FOR RAID ##########################################################
+
+  - alert: HostRaidArrayGotInactive
+    expr: node_md_state{state="inactive"} > 0
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: Host RAID array got inactive (instance {{ $labels.instance }})
+      description: "RAID array {{ $labels.device }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+  - alert: HostRaidDiskFailure
+    expr: node_md_disks{state="failed"} > 0
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host RAID disk failure (instance {{ $labels.instance }})
+      description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+
+## FOR CONTAINERS #####################################################
+
+- name: containers
+  rules:
+  - alert: nextcloud_down
+    expr: absent(container_memory_usage_bytes{name="nextcloud"})
+    for: 30s
+    labels:
+      severity: critical
+    annotations:
+      summary: "Nextcloud down"
+      description: "Nextcloud container is down for more than 30 seconds."
+
+  - alert: ContainerCpuUsage
+    expr: (sum(rate(container_cpu_usage_seconds_total{name!=""}[3m])) BY (instance, name) * 100) > 80
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: Container CPU usage (instance {{ $labels.instance }})
+      description: "Container CPU usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+  - alert: ContainerMemoryUsage
+    expr: (sum(container_memory_working_set_bytes{name!=""}) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) > 80
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: Container Memory usage (instance {{ $labels.instance }})
+      description: "Container Memory usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+
+## FOR NGINX ##########################################################
+
+- name: nginx
+  rules:
+  - alert: NginxHighHttp4xxErrorRate
+    expr: sum(rate(nginx_http_requests_total{status=~"^4.."}[1m])) / sum(rate(nginx_http_requests_total[1m])) * 100 > 5
+    for: 1m
+    labels:
+      severity: critical
+    annotations:
+      summary: Nginx high HTTP 4xx error rate (instance {{ $labels.instance }})
+      description: "Too many HTTP requests with status 4xx (> 5%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+  - alert: NginxHighHttp5xxErrorRate
+    expr: sum(rate(nginx_http_requests_total{status=~"^5.."}[1m])) / sum(rate(nginx_http_requests_total[1m])) * 100 > 5
+    for: 1m
+    labels:
+      severity: critical
+    annotations:
+      summary: Nginx high HTTP 5xx error rate (instance {{ $labels.instance }})
+      description: "Too many HTTP requests with status 5xx (> 5%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
diff --git a/alertmanager/alertmanager.yml b/alertmanager/alertmanager.yml
new file mode 100644
index 0000000..320dfd9
--- /dev/null
+++ b/alertmanager/alertmanager.yml
@@ -0,0 +1,39 @@
+global:
+  resolve_timeout: 5m
+
+route:
+  group_by: ['alertname']
+  group_wait: 10s
+  group_interval: 5m
+  repeat_interval: 10m
+  receiver: 'email'
+
+receivers:
+- name: 'email'
+  email_configs:
+  - to: 'mail1@mail.com, mail2@mail.com'
+    from: ''
+    smarthost:
+    auth_username: ''
+    auth_identity: ''
+    auth_password: ''
+    require_tls: yes
+    send_resolved: true
+
+# mute_time_intervals:
+#   - name: out-of-business-hours
+#     time_intervals:
+#       - weekdays: ['Saturday','Sunday']
+#       - times:
+#           - start_time: '00:00'
+#             end_time: '08:00'
+#           - start_time: '18:00'
+#             end_time: '24:00'
+
+inhibit_rules:
+  - source_match:
+      severity: 'critical'
+    target_match:
+      severity: 'warning'
+    equal: ['alertname', 'dev', 'instance']
+
diff --git a/alertmanager/config.yml b/alertmanager/config.yml
new file mode 100644
index 0000000..81158ae
--- /dev/null
+++ b/alertmanager/config.yml
@@ -0,0 +1,11 @@
+route:
+  receiver: 'slack'
+
+receivers:
+  - name: 'slack'
+    slack_configs:
+      - send_resolved: true
+        text: "{{ .CommonAnnotations.description }}"
+        username: 'Prometheus'
+        channel: '#prometheus'
+        api_url: 'https://hooks.slack.com/services/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'
diff --git a/alertmanager/custom-alert.yml b/alertmanager/custom-alert.yml
new file mode 100644
index 0000000..fc396c2
--- /dev/null
+++ b/alertmanager/custom-alert.yml
@@ -0,0 +1,37 @@
+# Whether to notify about resolved alerts.
+[ send_resolved: <boolean> | default = false ]
+
+# The email address to send notifications to.
+to: <tmpl_string>
+
+# The sender's address.
+[ from: <tmpl_string> | default = global.smtp_from ]
+
+# The SMTP host through which emails are sent.
+[ smarthost: <string> | default = global.smtp_smarthost ]
+
+# The hostname to identify to the SMTP server.
+[ hello: <string> | default = global.smtp_hello ]
+
+# SMTP authentication information.
+[ auth_username: <string> | default = global.smtp_auth_username ]
+[ auth_password: <secret> | default = global.smtp_auth_password ]
+[ auth_secret: <secret> | default = global.smtp_auth_secret ]
+[ auth_identity: <string> | default = global.smtp_auth_identity ]
+
+# The SMTP TLS requirement.
+# Note that Go does not support unencrypted connections to remote SMTP endpoints.
+[ require_tls: <boolean> | default = global.smtp_require_tls ]
+
+# TLS configuration.
+tls_config:
+  [ <tls_config> ]
+
+# The HTML body of the email notification.
+[ html: <tmpl_string> | default = '{{ template "email.default.html" . }}' ]
+# The text body of the email notification.
+[ text: <tmpl_string> ]
+
+# Further headers email header key/value pairs. Overrides any headers
+# previously set by the notification implementation.
+[ headers: { <string>: <tmpl_string>, ... } ]
\ No newline at end of file
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..58efe1a
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,135 @@
+version: '2.1'
+
+networks:
+  monitor-net:
+    driver: bridge
+
+volumes:
+  prometheus_data: {}
+  grafana_data: {}
+
+services:
+
+  prometheus:
+    image: prom/prometheus:v2.17.1
+    container_name: prometheus
+    volumes:
+      - ./prometheus:/etc/prometheus
+      - prometheus_data:/prometheus
+    command:
+      - '--config.file=/etc/prometheus/prometheus.yml'
+      - '--storage.tsdb.path=/prometheus'
+      - '--web.console.libraries=/etc/prometheus/console_libraries'
+      - '--web.console.templates=/etc/prometheus/consoles'
+      - '--storage.tsdb.retention.time=200h'
+      - '--web.enable-lifecycle'
+    restart: unless-stopped
+    expose:
+      - 9090
+    networks:
+      - monitor-net
+    labels:
+      org.label-schema.group: "monitoring"
+
+  alertmanager:
+    image: prom/alertmanager:v0.20.0
+    container_name: alertmanager
+    volumes:
+      - ./alertmanager:/etc/alertmanager
+    command:
+      #- '--config.file=/etc/alertmanager/config.yml'
+      - '--config.file=/etc/alertmanager/alertmanager.yml'
+      - '--storage.path=/alertmanager'
+    restart: unless-stopped
+    expose:
+      - 9093
+    networks:
+      - monitor-net
+    labels:
+      org.label-schema.group: "monitoring"
+
+  nodeexporter:
+    image: prom/node-exporter:v0.18.1
+    container_name: nodeexporter
+    volumes:
+      - /proc:/host/proc:ro
+      - /sys:/host/sys:ro
+      - /:/rootfs:ro
+    command:
+      - '--path.procfs=/host/proc'
+      - '--path.rootfs=/rootfs'
+      - '--path.sysfs=/host/sys'
+      - '--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|host|etc)($$|/)'
+    restart: unless-stopped
+    expose:
+      - 9100
+    networks:
+      - monitor-net
+    labels:
+      org.label-schema.group: "monitoring"
+
+  cadvisor:
+    #image: gcr.io/google-containers/cadvisor:v0.34.0
+    image: gcr.io/cadvisor/cadvisor
+    container_name: cadvisor
+    volumes:
+      - /:/rootfs:ro
+      - /var/run:/var/run:rw
+      - /sys:/sys:ro
+      - /var/lib/docker:/var/lib/docker:ro
+      #- /cgroup:/cgroup:ro #doesn't work on MacOS only for Linux
+    restart: unless-stopped
+    expose:
+      - 8080
+    networks:
+      - monitor-net
+    labels:
+      org.label-schema.group: "monitoring"
+
+  grafana:
+    image: grafana/grafana:6.7.2
+    container_name: grafana
+    volumes:
+      - grafana_data:/var/lib/grafana
+      - ./grafana/provisioning:/etc/grafana/provisioning
+    environment:
+      - GF_SECURITY_ADMIN_USER=${ADMIN_USER}
+      - GF_SECURITY_ADMIN_PASSWORD=${ADMIN_PASSWORD}
+      - GF_USERS_ALLOW_SIGN_UP=false
+    restart: unless-stopped
+    expose:
+      - 3000
+    networks:
+      - monitor-net
+    labels:
+      org.label-schema.group: "monitoring"
+
+  pushgateway:
+    image: prom/pushgateway:v1.2.0
+    container_name: pushgateway
+    restart: unless-stopped
+    expose:
+      - 9091
+    networks:
+      - monitor-net
+    labels:
+      org.label-schema.group: "monitoring"
+
+  caddy:
+    image: stefanprodan/caddy
+    container_name: caddy
+    ports:
+      - "3000:3000"
+      - "9090:9090"
+      - "9093:9093"
+      - "9091:9091"
+    volumes:
+      - ./caddy:/etc/caddy
+    environment:
+      - ADMIN_USER=${ADMIN_USER}
+      - ADMIN_PASSWORD=${ADMIN_PASSWORD}
+    restart: unless-stopped
+    networks:
+      - monitor-net
+    labels:
+      org.label-schema.group: "monitoring"
diff --git a/prometheus/prometheus.yml b/prometheus/prometheus.yml
new file mode 100644
index 0000000..7906963
--- /dev/null
+++ b/prometheus/prometheus.yml
@@ -0,0 +1,53 @@
+global:
+  scrape_interval: 15s
+  evaluation_interval: 15s
+
+  # Attach these labels to any time series or alerts when communicating with
+  # external systems (federation, remote storage, Alertmanager).
+  external_labels:
+    monitor: 'docker-host-alpha'
+
+# Load and evaluate rules in this file every 'evaluation_interval' seconds.
+rule_files:
+  - "alert.rules"
+
+# A scrape configuration containing exactly one endpoint to scrape.
+scrape_configs:
+  - job_name: 'nodeexporter'
+    scrape_interval: 5s
+    static_configs:
+      - targets: ['nodeexporter:9100']
+
+  - job_name: 'cadvisor'
+    scrape_interval: 5s
+    static_configs:
+      - targets: ['cadvisor:8080']
+
+  - job_name: 'prometheus'
+    scrape_interval: 10s
+    static_configs:
+      - targets: ['localhost:9090']
+
+  - job_name: 'pushgateway'
+    scrape_interval: 10s
+    honor_labels: true
+    static_configs:
+      - targets: ['pushgateway:9091']
+
+
+alerting:
+  alertmanagers:
+  - scheme: http
+    static_configs:
+    - targets:
+      - 'alertmanager:9093'
+
+# - job_name: 'nginx'
+#   scrape_interval: 10s
+#   static_configs:
+#     - targets: ['nginxexporter:9113']
+
+# - job_name: 'aspnetcore'
+#   scrape_interval: 10s
+#   static_configs:
+#     - targets: ['eventlog-proxy:5000', 'eventlog:5000']
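To go with the configuration steps in the README, here is a minimal usage sketch for the stack this patch sets up. It assumes the commands run from the repository root, that the default `ADMIN_USER`/`ADMIN_PASSWORD` from `.env` are still in place, and that port 9090 is reached through the Caddy proxy defined in `docker-compose.yml` (which enforces basic auth); adjust the credentials if you changed them.

```bash
# Start the monitoring stack defined in docker-compose.yml
docker-compose up -d

# Validate the alerting rules with promtool from the same Prometheus image used by the stack
docker run --rm -v "$(pwd)/prometheus:/etc/prometheus" \
  --entrypoint promtool prom/prometheus:v2.17.1 check rules /etc/prometheus/alert.rules

# Reload Prometheus after editing alert.rules; possible because --web.enable-lifecycle is set.
# The request goes through Caddy, so pass the credentials configured in .env.
curl -X POST -u admin:password http://localhost:9090/-/reload
```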