commit
924c6f0b99
@ -0,0 +1,22 @@ |
|||||||
|
# PROMETHEUS & GRAFANA |
||||||
|
|
||||||
|
Ce projet vise à monitorer un serveur avec une stack Docker. |
||||||
|
|
||||||
|
## CONFIGURATION |
||||||
|
|
||||||
|
- Configuration des accès via Caddy: |
||||||
|
```bash |
||||||
|
nano .env |
||||||
|
``` |
||||||
|
|
||||||
|
- Configuration de la boîte mail pour les alertes: |
||||||
|
```bash |
||||||
|
nano alertmanager/alertmanager.yml |
||||||
|
``` |
||||||
|
|
||||||
|
- Configuration des alertes: |
||||||
|
```bash |
||||||
|
nano alertmanager/alert.rules |
||||||
|
``` |
||||||
|
|
||||||
|
> Grafana est accessible via l'adresse: http://<IP-SERVER>:3000 |
@ -0,0 +1,172 @@ |
|||||||
|
groups: |
||||||
|
# Group "targets": a single rule on the `up` series.
- name: targets
  rules:
    # Fires (critical) once `up == 0` has held for 30 seconds.
    - alert: monitor_service_down
      expr: 'up == 0'
      for: 30s
      labels:
        severity: 'critical'
      annotations:
        summary: 'Monitor service non-operational'
        description: 'Service {{ $labels.instance }} is down.'
||||||
|
|
||||||
|
## FOR HOST ################################################################## |
||||||
|
|
||||||
|
# Group "host": alerts evaluated on node_* series (CPU, memory, temperature,
# swap, filesystem, network and md RAID).
- name: host
  rules:
    # CPU busy % = 100 - per-instance average idle rate over 2m; fires above 80.
    - alert: HostHighCpuLoad
      expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 80
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: "Host high CPU load (instance {{ $labels.instance }})"
        description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    # Used memory % = (total - (free + buffers + cached)) / total * 100.
    # Evaluated per series rather than through a label-less sum(): the bare
    # sum() dropped the `instance` and `job` labels that the description
    # interpolates, so they rendered empty in notifications.
    - alert: high_memory_load
      expr: (node_memory_MemTotal_bytes - (node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes)) / node_memory_MemTotal_bytes * 100 > 85
      for: 30s
      labels:
        severity: warning
      annotations:
        summary: "Server memory is almost full"
        description: "Docker host memory usage is {{ humanize $value}}%. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}."

    # Any hwmon temperature reading above 75 for 5 minutes.
    - alert: HostPhysicalComponentTooHot
      expr: node_hwmon_temp_celsius > 75
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "Host physical component too hot (instance {{ $labels.instance }})"
        description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    # Swap used % = (1 - free / total) * 100; fires above 80 for 2 minutes.
    - alert: HostSwapIsFillingUp
      expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: "Host swap is filling up (instance {{ $labels.instance }})"
        description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    # Used storage % on filesystems whose fstype label is "aufs".
    # NOTE(review): on hosts with no aufs mounts this selector matches nothing
    # and the alert can never fire - confirm the intended fstype.
    - alert: high_storage_load
      expr: (node_filesystem_size_bytes{fstype="aufs"} - node_filesystem_free_bytes{fstype="aufs"}) / node_filesystem_size_bytes{fstype="aufs"} * 100 > 85
      for: 30s
      labels:
        severity: warning
      annotations:
        summary: "Server storage is almost full"
        description: "Docker host storage usage is {{ humanize $value}}%. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}."

    # Available memory below 10% of total for 2 minutes.
    - alert: HostOutOfMemory
      expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: "Host out of memory (instance {{ $labels.instance }})"
        description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    # Per-instance receive rate (bytes/s over 2m, divided by 1024 twice) above 100.
    - alert: HostUnusualNetworkThroughputIn
      expr: sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "Host unusual network throughput in (instance {{ $labels.instance }})"
        description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    # Per-instance transmit rate, same scaling and threshold as the rule above.
    - alert: HostUnusualNetworkThroughputOut
      expr: sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "Host unusual network throughput out (instance {{ $labels.instance }})"
        description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    # Less than 10% space available, restricted to filesystems whose
    # node_filesystem_readonly value is 0.
    - alert: HostOutOfDiskSpace
      expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: "Host out of disk space (instance {{ $labels.instance }})"
        description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    ## FOR RAID ##########################################################

    # Fires immediately (for: 0m) when an md array reports state "inactive".
    - alert: HostRaidArrayGotInactive
      expr: node_md_state{state="inactive"} > 0
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: "Host RAID array got inactive (instance {{ $labels.instance }})"
        description: "RAID array {{ $labels.device }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    # At least one md member disk in state "failed" for 2 minutes.
    # The array is named through the `device` label, the same label the
    # HostRaidArrayGotInactive rule uses (previously `md_device`).
    - alert: HostRaidDiskFailure
      expr: node_md_disks{state="failed"} > 0
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: "Host RAID disk failure (instance {{ $labels.instance }})"
        description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.device }} needs attention and possibly a disk swap\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
||||||
|
|
||||||
|
|
||||||
|
## FOR CONTAINERS ##################################################### |
||||||
|
|
||||||
|
# Group "containers": alerts evaluated on container_* series.
- name: containers
  rules:
    # Fires when no container_memory_usage_bytes series exists for the
    # Nextcloud container for 30 seconds. The selector previously matched
    # name="jenkins", which contradicted the alert name and its annotations.
    # NOTE(review): assumes the container is named "nextcloud" - confirm
    # against the container_name used in the Nextcloud deployment.
    - alert: nextcloud_down
      expr: absent(container_memory_usage_bytes{name="nextcloud"})
      for: 30s
      labels:
        severity: critical
      annotations:
        summary: "Nextcloud down"
        description: "Nextcloud container is down for more than 30 seconds."

    # CPU seconds consumed per second (3m rate) per instance/name, times 100.
    - alert: ContainerCpuUsage
      expr: (sum(rate(container_cpu_usage_seconds_total{name!=""}[3m])) BY (instance, name) * 100) > 80
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: "Container CPU usage (instance {{ $labels.instance }})"
        description: "Container CPU usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    # Working-set bytes as a percentage of the memory limit; series whose
    # limit is not > 0 are filtered out of the denominator.
    - alert: ContainerMemoryUsage
      expr: (sum(container_memory_working_set_bytes{name!=""}) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) > 80
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: "Container Memory usage (instance {{ $labels.instance }})"
        description: "Container Memory usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
||||||
|
|
||||||
|
|
||||||
|
## FOR NGINX ########################################################## |
||||||
|
|
||||||
|
# Group "nginx": HTTP error-ratio alerts on nginx_http_requests_total.
# Both ratios aggregate `by (instance)`: the previous label-less sum() removed
# the `instance` label that the summaries interpolate, so it rendered empty.
- name: nginx
  rules:
    # Share of requests with a 4xx status over 1m, in percent; fires above 5.
    - alert: NginxHighHttp4xxErrorRate
      expr: sum by (instance) (rate(nginx_http_requests_total{status=~"^4.."}[1m])) / sum by (instance) (rate(nginx_http_requests_total[1m])) * 100 > 5
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: "Nginx high HTTP 4xx error rate (instance {{ $labels.instance }})"
        description: "Too many HTTP requests with status 4xx (> 5%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    # Share of requests with a 5xx status over 1m, in percent; fires above 5.
    - alert: NginxHighHttp5xxErrorRate
      expr: sum by (instance) (rate(nginx_http_requests_total{status=~"^5.."}[1m])) / sum by (instance) (rate(nginx_http_requests_total[1m])) * 100 > 5
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: "Nginx high HTTP 5xx error rate (instance {{ $labels.instance }})"
        description: "Too many HTTP requests with status 5xx (> 5%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
||||||
|
|
@ -0,0 +1,172 @@ |
|||||||
|
groups: |
||||||
|
# Group "targets": a single rule on the `up` series.
- name: targets
  rules:
    # Fires (critical) once `up == 0` has held for 30 seconds.
    - alert: monitor_service_down
      expr: 'up == 0'
      for: 30s
      labels:
        severity: 'critical'
      annotations:
        summary: 'Monitor service non-operational'
        description: 'Service {{ $labels.instance }} is down.'
||||||
|
|
||||||
|
## FOR HOST ################################################################## |
||||||
|
|
||||||
|
# Group "host": alerts evaluated on node_* series (CPU, memory, temperature,
# swap, filesystem, network and md RAID).
- name: host
  rules:
    # CPU busy % = 100 - per-instance average idle rate over 2m; fires above 80.
    - alert: HostHighCpuLoad
      expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 80
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: "Host high CPU load (instance {{ $labels.instance }})"
        description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    # Used memory % = (total - (free + buffers + cached)) / total * 100.
    # Evaluated per series rather than through a label-less sum(): the bare
    # sum() dropped the `instance` and `job` labels that the description
    # interpolates, so they rendered empty in notifications.
    - alert: high_memory_load
      expr: (node_memory_MemTotal_bytes - (node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes)) / node_memory_MemTotal_bytes * 100 > 85
      for: 30s
      labels:
        severity: warning
      annotations:
        summary: "Server memory is almost full"
        description: "Docker host memory usage is {{ humanize $value}}%. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}."

    # Any hwmon temperature reading above 75 for 5 minutes.
    - alert: HostPhysicalComponentTooHot
      expr: node_hwmon_temp_celsius > 75
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "Host physical component too hot (instance {{ $labels.instance }})"
        description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    # Swap used % = (1 - free / total) * 100; fires above 80 for 2 minutes.
    - alert: HostSwapIsFillingUp
      expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: "Host swap is filling up (instance {{ $labels.instance }})"
        description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    # Used storage % on filesystems whose fstype label is "aufs".
    # NOTE(review): on hosts with no aufs mounts this selector matches nothing
    # and the alert can never fire - confirm the intended fstype.
    - alert: high_storage_load
      expr: (node_filesystem_size_bytes{fstype="aufs"} - node_filesystem_free_bytes{fstype="aufs"}) / node_filesystem_size_bytes{fstype="aufs"} * 100 > 85
      for: 30s
      labels:
        severity: warning
      annotations:
        summary: "Server storage is almost full"
        description: "Docker host storage usage is {{ humanize $value}}%. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}."

    # Available memory below 10% of total for 2 minutes.
    - alert: HostOutOfMemory
      expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: "Host out of memory (instance {{ $labels.instance }})"
        description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    # Per-instance receive rate (bytes/s over 2m, divided by 1024 twice) above 100.
    - alert: HostUnusualNetworkThroughputIn
      expr: sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "Host unusual network throughput in (instance {{ $labels.instance }})"
        description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    # Per-instance transmit rate, same scaling and threshold as the rule above.
    - alert: HostUnusualNetworkThroughputOut
      expr: sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "Host unusual network throughput out (instance {{ $labels.instance }})"
        description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    # Less than 10% space available, restricted to filesystems whose
    # node_filesystem_readonly value is 0.
    - alert: HostOutOfDiskSpace
      expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: "Host out of disk space (instance {{ $labels.instance }})"
        description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    ## FOR RAID ##########################################################

    # Fires immediately (for: 0m) when an md array reports state "inactive".
    - alert: HostRaidArrayGotInactive
      expr: node_md_state{state="inactive"} > 0
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: "Host RAID array got inactive (instance {{ $labels.instance }})"
        description: "RAID array {{ $labels.device }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    # At least one md member disk in state "failed" for 2 minutes.
    # The array is named through the `device` label, the same label the
    # HostRaidArrayGotInactive rule uses (previously `md_device`).
    - alert: HostRaidDiskFailure
      expr: node_md_disks{state="failed"} > 0
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: "Host RAID disk failure (instance {{ $labels.instance }})"
        description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.device }} needs attention and possibly a disk swap\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
||||||
|
|
||||||
|
|
||||||
|
## FOR CONTAINERS ##################################################### |
||||||
|
|
||||||
|
# Group "containers": alerts evaluated on container_* series.
- name: containers
  rules:
    # Fires when no container_memory_usage_bytes series exists for the
    # Nextcloud container for 30 seconds. The selector previously matched
    # name="jenkins", which contradicted the alert name and its annotations.
    # NOTE(review): assumes the container is named "nextcloud" - confirm
    # against the container_name used in the Nextcloud deployment.
    - alert: nextcloud_down
      expr: absent(container_memory_usage_bytes{name="nextcloud"})
      for: 30s
      labels:
        severity: critical
      annotations:
        summary: "Nextcloud down"
        description: "Nextcloud container is down for more than 30 seconds."

    # CPU seconds consumed per second (3m rate) per instance/name, times 100.
    - alert: ContainerCpuUsage
      expr: (sum(rate(container_cpu_usage_seconds_total{name!=""}[3m])) BY (instance, name) * 100) > 80
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: "Container CPU usage (instance {{ $labels.instance }})"
        description: "Container CPU usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    # Working-set bytes as a percentage of the memory limit; series whose
    # limit is not > 0 are filtered out of the denominator.
    - alert: ContainerMemoryUsage
      expr: (sum(container_memory_working_set_bytes{name!=""}) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) > 80
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: "Container Memory usage (instance {{ $labels.instance }})"
        description: "Container Memory usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
||||||
|
|
||||||
|
|
||||||
|
## FOR NGINX ########################################################## |
||||||
|
|
||||||
|
# Group "nginx": HTTP error-ratio alerts on nginx_http_requests_total.
# Both ratios aggregate `by (instance)`: the previous label-less sum() removed
# the `instance` label that the summaries interpolate, so it rendered empty.
- name: nginx
  rules:
    # Share of requests with a 4xx status over 1m, in percent; fires above 5.
    - alert: NginxHighHttp4xxErrorRate
      expr: sum by (instance) (rate(nginx_http_requests_total{status=~"^4.."}[1m])) / sum by (instance) (rate(nginx_http_requests_total[1m])) * 100 > 5
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: "Nginx high HTTP 4xx error rate (instance {{ $labels.instance }})"
        description: "Too many HTTP requests with status 4xx (> 5%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    # Share of requests with a 5xx status over 1m, in percent; fires above 5.
    - alert: NginxHighHttp5xxErrorRate
      expr: sum by (instance) (rate(nginx_http_requests_total{status=~"^5.."}[1m])) / sum by (instance) (rate(nginx_http_requests_total[1m])) * 100 > 5
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: "Nginx high HTTP 5xx error rate (instance {{ $labels.instance }})"
        description: "Too many HTTP requests with status 5xx (> 5%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
||||||
|
|
@ -0,0 +1,39 @@ |
|||||||
|
# Alertmanager global settings.
global:
  # Resolve timeout is set to five minutes.
  resolve_timeout: '5m'
||||||
|
|
||||||
|
# Root route: every alert goes to the 'email' receiver, grouped by alert name.
route:
  receiver: "email"
  group_by:
    - "alertname"
  # Timers: 10s group wait, 5m between group updates, 10m between repeats.
  group_wait: 10s
  group_interval: 5m
  repeat_interval: 10m
||||||
|
|
||||||
|
# Notification targets. The single 'email' receiver is a template: the empty
# strings below are placeholders that must be filled in before use.
receivers:
  - name: 'email'
    email_configs:
      - to: 'mail1@mail.com, mail2@mail.com'
        from: ''
        # Written as an explicit empty string (was a bare `smarthost:`, i.e.
        # null) so it reads as an unfilled placeholder like its siblings.
        smarthost: ''
        auth_username: ''
        auth_identity: ''
        auth_password: ''
        # Canonical YAML boolean instead of the YAML 1.1 truthy value `yes`.
        require_tls: true
        send_resolved: true
||||||
|
|
||||||
|
# mute_time_intervals: |
||||||
|
# - name: out-of-business-hours |
||||||
|
# time_intervals: |
||||||
|
# - weekdays: ['Saturday','Sunday'] |
||||||
|
# - times: |
||||||
|
# - start_time: '00:00' |
||||||
|
# end_time: '08:00' |
||||||
|
# - start_time: '18:00' |
||||||
|
# end_time: '24:00' |
||||||
|
|
||||||
|
# Inhibition: a 'critical' alert mutes a 'warning' alert when both carry the
# same values for the labels listed under `equal`.
inhibit_rules:
  - source_match:
      severity: "critical"
    target_match:
      severity: "warning"
    equal:
      - "alertname"
      - "dev"
      - "instance"
||||||
|
|
@ -0,0 +1,11 @@ |
|||||||
|
# Root route: every alert is delivered to the 'slack' receiver.
route:
  receiver: "slack"
||||||
|
|
||||||
|
# Slack receiver: posts the alerts' common `description` annotation to the
# '#prometheus' channel as user 'Prometheus', including resolved notifications.
receivers:
  - name: 'slack'
    slack_configs:
      - send_resolved: true
        text: '{{ .CommonAnnotations.description }}'
        username: 'Prometheus'
        channel: '#prometheus'
        # SECURITY: a real incoming-webhook URL was committed here. A webhook
        # URL is a credential - revoke/rotate the exposed one in Slack and
        # supply the new value at deploy time instead of committing it.
        api_url: 'https://hooks.slack.com/services/REPLACE/WITH/YOUR_WEBHOOK'
@ -0,0 +1,37 @@ |
|||||||
|
# Whether to notify about resolved alerts. |
||||||
|
[ send_resolved: <boolean> | default = false ] |
||||||
|
|
||||||
|
# The email address to send notifications to. |
||||||
|
to: <tmpl_string> |
||||||
|
|
||||||
|
# The sender's address. |
||||||
|
[ from: <tmpl_string> | default = global.smtp_from ] |
||||||
|
|
||||||
|
# The SMTP host through which emails are sent. |
||||||
|
[ smarthost: <string> | default = global.smtp_smarthost ] |
||||||
|
|
||||||
|
# The hostname to identify to the SMTP server. |
||||||
|
[ hello: <string> | default = global.smtp_hello ] |
||||||
|
|
||||||
|
# SMTP authentication information. |
||||||
|
[ auth_username: <string> | default = global.smtp_auth_username ] |
||||||
|
[ auth_password: <secret> | default = global.smtp_auth_password ] |
||||||
|
[ auth_secret: <secret> | default = global.smtp_auth_secret ] |
||||||
|
[ auth_identity: <string> | default = global.smtp_auth_identity ] |
||||||
|
|
||||||
|
# The SMTP TLS requirement. |
||||||
|
# Note that Go does not support unencrypted connections to remote SMTP endpoints. |
||||||
|
[ require_tls: <bool> | default = global.smtp_require_tls ] |
||||||
|
|
||||||
|
# TLS configuration. |
||||||
|
tls_config: |
||||||
|
[ <tls_config> ] |
||||||
|
|
||||||
|
# The HTML body of the email notification. |
||||||
|
[ html: <tmpl_string> | default = '{{ template "email.default.html" . }}' ] |
||||||
|
# The text body of the email notification. |
||||||
|
[ text: <tmpl_string> ] |
||||||
|
|
||||||
|
# Further email header key/value pairs. Overrides any headers |
||||||
|
# previously set by the notification implementation. |
||||||
|
[ headers: { <string>: <tmpl_string>, ... } ] |
@ -0,0 +1,135 @@ |
|||||||
|
version: '2.1' |
||||||
|
|
||||||
|
# Single bridge network that every service in this file attaches to.
networks:
  monitor-net:
    driver: "bridge"
||||||
|
|
||||||
|
# Named volumes with default settings: mounted by the prometheus and grafana
# services respectively.
volumes:
  grafana_data: {}
  prometheus_data: {}
||||||
|
|
||||||
|
services: |
||||||
|
|
||||||
|
# Prometheus server. Configuration comes from ./prometheus on the host; the
# TSDB lives in the prometheus_data volume. Port 9090 is only exposed on the
# compose network, not published on the host.
prometheus:
  container_name: prometheus
  image: prom/prometheus:v2.17.1
  restart: unless-stopped
  networks:
    - monitor-net
  expose:
    - 9090
  volumes:
    - ./prometheus:/etc/prometheus
    - prometheus_data:/prometheus
  command:
    - "--config.file=/etc/prometheus/prometheus.yml"
    - "--storage.tsdb.path=/prometheus"
    - "--web.console.libraries=/etc/prometheus/console_libraries"
    - "--web.console.templates=/etc/prometheus/consoles"
    - "--storage.tsdb.retention.time=200h"
    - "--web.enable-lifecycle"
  labels:
    org.label-schema.group: 'monitoring'
||||||
|
|
||||||
|
# Alertmanager. Reads /etc/alertmanager/alertmanager.yml from the ./alertmanager
# bind mount; port 9093 is only exposed on the compose network.
alertmanager:
  container_name: alertmanager
  image: prom/alertmanager:v0.20.0
  restart: unless-stopped
  networks:
    - monitor-net
  expose:
    - 9093
  volumes:
    - ./alertmanager:/etc/alertmanager
  command:
    # - '--config.file=/etc/alertmanager/config.yml'
    - "--config.file=/etc/alertmanager/alertmanager.yml"
    - "--storage.path=/alertmanager"
  labels:
    org.label-schema.group: 'monitoring'
||||||
|
|
||||||
|
# Node exporter. The host's /proc, /sys and / are bind-mounted read-only and
# passed to the exporter through the --path.* flags; port 9100 is only exposed
# on the compose network.
nodeexporter:
  container_name: nodeexporter
  image: prom/node-exporter:v0.18.1
  restart: unless-stopped
  networks:
    - monitor-net
  expose:
    - 9100
  volumes:
    - /proc:/host/proc:ro
    - /sys:/host/sys:ro
    - /:/rootfs:ro
  command:
    - "--path.procfs=/host/proc"
    - "--path.rootfs=/rootfs"
    - "--path.sysfs=/host/sys"
    # `$$` is Compose's escape for a literal `$` in the regex.
    - "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|host|etc)($$|/)"
  labels:
    org.label-schema.group: 'monitoring'
||||||
|
|
||||||
|
# cAdvisor (container metrics). Host paths are bind-mounted so it can inspect
# the containers; port 8080 is only exposed on the compose network.
cadvisor:
  # image: gcr.io/google-containers/cadvisor:v0.34.0
  # Pinned to an explicit tag: the image was previously untagged (implicit
  # `latest`), unlike the other pinned services, which makes deployments
  # non-reproducible.
  # NOTE(review): v0.47.2 is a suggested release - pin whichever version you
  # have validated.
  image: gcr.io/cadvisor/cadvisor:v0.47.2
  container_name: cadvisor
  volumes:
    - /:/rootfs:ro
    - /var/run:/var/run:rw
    - /sys:/sys:ro
    - /var/lib/docker:/var/lib/docker:ro
    # - /cgroup:/cgroup:ro  # doesn't work on MacOS only for Linux
  restart: unless-stopped
  expose:
    - 8080
  networks:
    - monitor-net
  labels:
    org.label-schema.group: "monitoring"
||||||
|
|
||||||
|
# Grafana. Data persists in the grafana_data volume and provisioning files come
# from ./grafana/provisioning. Admin credentials are interpolated from the
# ADMIN_USER / ADMIN_PASSWORD variables; port 3000 is only exposed on the
# compose network.
grafana:
  container_name: grafana
  image: grafana/grafana:6.7.2
  restart: unless-stopped
  networks:
    - monitor-net
  expose:
    - 3000
  volumes:
    - grafana_data:/var/lib/grafana
    - ./grafana/provisioning:/etc/grafana/provisioning
  # Mapping form of the same three variables; values are quoted so they stay
  # strings.
  environment:
    GF_SECURITY_ADMIN_USER: "${ADMIN_USER}"
    GF_SECURITY_ADMIN_PASSWORD: "${ADMIN_PASSWORD}"
    GF_USERS_ALLOW_SIGN_UP: "false"
  labels:
    org.label-schema.group: 'monitoring'
||||||
|
|
||||||
|
# Pushgateway. No volumes or extra flags; port 9091 is only exposed on the
# compose network.
pushgateway:
  container_name: pushgateway
  image: prom/pushgateway:v1.2.0
  restart: unless-stopped
  networks:
    - monitor-net
  expose:
    - 9091
  labels:
    org.label-schema.group: 'monitoring'
||||||
|
|
||||||
|
# Caddy: the only service that publishes ports on the host (3000, 9090, 9093,
# 9091). Its configuration comes from ./caddy and it receives the
# ADMIN_USER / ADMIN_PASSWORD variables.
# NOTE(review): the image has no tag, so the `latest` tag is pulled - consider
# pinning a version.
caddy:
  container_name: caddy
  image: stefanprodan/caddy
  restart: unless-stopped
  networks:
    - monitor-net
  ports:
    - '3000:3000'
    - '9090:9090'
    - '9093:9093'
    - '9091:9091'
  volumes:
    - ./caddy:/etc/caddy
  environment:
    ADMIN_USER: "${ADMIN_USER}"
    ADMIN_PASSWORD: "${ADMIN_PASSWORD}"
  labels:
    org.label-schema.group: 'monitoring'
@ -0,0 +1,53 @@ |
|||||||
|
# Prometheus global defaults.
global:
  # Scrape targets and evaluate rules every 15 seconds unless a job overrides it.
  evaluation_interval: 15s
  scrape_interval: 15s

  # Labels attached to any time series or alert when talking to external
  # systems (federation, remote storage, Alertmanager).
  external_labels:
    monitor: "docker-host-alpha"
||||||
|
|
||||||
|
# Load and evaluate rules in this file every 'evaluation_interval' seconds. |
||||||
|
# Rule files to load; only "alert.rules" is listed.
rule_files:
  - 'alert.rules'
||||||
|
|
||||||
|
# Scrape configurations: one job per monitored endpoint. |
||||||
|
# Four static scrape jobs, each with its own scrape interval.
scrape_configs:
  - job_name: "nodeexporter"
    scrape_interval: 5s
    static_configs:
      - targets:
          - "nodeexporter:9100"

  - job_name: "cadvisor"
    scrape_interval: 5s
    static_configs:
      - targets:
          - "cadvisor:8080"

  - job_name: "prometheus"
    scrape_interval: 10s
    static_configs:
      - targets:
          - "localhost:9090"

  # honor_labels is enabled for this job only.
  - job_name: "pushgateway"
    scrape_interval: 10s
    honor_labels: true
    static_configs:
      - targets:
          - "pushgateway:9091"
||||||
|
|
||||||
|
|
||||||
|
# Alerts are sent over plain HTTP to the single Alertmanager at alertmanager:9093.
alerting:
  alertmanagers:
    - scheme: "http"
      static_configs:
        - targets: ["alertmanager:9093"]
||||||
|
|
||||||
|
# - job_name: 'nginx' |
||||||
|
# scrape_interval: 10s |
||||||
|
# static_configs: |
||||||
|
# - targets: ['nginxexporter:9113'] |
||||||
|
|
||||||
|
# - job_name: 'aspnetcore' |
||||||
|
# scrape_interval: 10s |
||||||
|
# static_configs: |
||||||
|
# - targets: ['eventlog-proxy:5000', 'eventlog:5000'] |
Loading…
Reference in new issue