Add Prometheus + cAdvisor + Alertmanager

parent: ad12256bdc
commit: fc92d0cd67

README.md (10 changes)
@@ -4,13 +4,18 @@

 ## Config

-- Grafana: grafana/provisioning/datasources/influxdb.yml
+- Grafana: grafana/provisioning/datasources/influxdb.yml (password for influxdb)
 - InfluxDB:
+- Telegraf: telegraf/telegraf.conf (line 63)

 ### Production use

+- Telegraf config: telegraf.conf (line 63)
+
+  urls = ["http://SERVER-IP:8086"] # required
+
 ## Usage

 - start stack

@@ -48,4 +53,7 @@ docker-compose logs -f

 - Password: admin
 - Database: influx

+# To do:
+- [ ] install script
+- [ ] add cAdvisor + Prometheus
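Before the stack is useful, the SERVER-IP placeholder on telegraf.conf line 63 has to be replaced with the real InfluxDB host. A minimal sketch of scripting that swap, assuming GNU sed and using 192.168.1.50 purely as an example address:

    $ sed -i 's|http://SERVER-IP:8086|http://192.168.1.50:8086|' telegraf/telegraf.conf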
alertmanager/alert.rules (new file, 172 lines)
@@ -0,0 +1,172 @@
groups:
- name: targets
  rules:
  - alert: monitor_service_down
    expr: up == 0
    for: 30s
    labels:
      severity: critical
    annotations:
      summary: "Monitor service non-operational"
      description: "Service {{ $labels.instance }} is down."

## FOR HOST ##################################################################

- name: host
  rules:
  - alert: HostHighCpuLoad
    expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 80
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: Host high CPU load (instance {{ $labels.instance }})
      description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

  - alert: high_memory_load
    expr: (sum(node_memory_MemTotal_bytes) - sum(node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes)) / sum(node_memory_MemTotal_bytes) * 100 > 85
    for: 30s
    labels:
      severity: warning
    annotations:
      summary: "Server memory is almost full"
      description: "Docker host memory usage is {{ humanize $value }}%. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}."

  - alert: HostPhysicalComponentTooHot
    expr: node_hwmon_temp_celsius > 75
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: Host physical component too hot (instance {{ $labels.instance }})
      description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

  - alert: HostSwapIsFillingUp
    expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host swap is filling up (instance {{ $labels.instance }})
      description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

  - alert: high_storage_load
    expr: (node_filesystem_size_bytes{fstype="aufs"} - node_filesystem_free_bytes{fstype="aufs"}) / node_filesystem_size_bytes{fstype="aufs"} * 100 > 85
    for: 30s
    labels:
      severity: warning
    annotations:
      summary: "Server storage is almost full"
      description: "Docker host storage usage is {{ humanize $value }}%. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}."

  - alert: HostOutOfMemory
    expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host out of memory (instance {{ $labels.instance }})
      description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

  - alert: HostUnusualNetworkThroughputIn
    expr: sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: Host unusual network throughput in (instance {{ $labels.instance }})
      description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

  - alert: HostUnusualNetworkThroughputOut
    expr: sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: Host unusual network throughput out (instance {{ $labels.instance }})
      description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

  - alert: HostOutOfDiskSpace
    expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host out of disk space (instance {{ $labels.instance }})
      description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

## FOR RAID ##########################################################

  - alert: HostRaidArrayGotInactive
    expr: node_md_state{state="inactive"} > 0
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: Host RAID array got inactive (instance {{ $labels.instance }})
      description: "RAID array {{ $labels.device }} is in a degraded state due to one or more disk failures. The number of spare drives is insufficient to fix the issue automatically.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

  - alert: HostRaidDiskFailure
    expr: node_md_disks{state="failed"} > 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host RAID disk failure (instance {{ $labels.instance }})
      description: "At least one device in the RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

## FOR CONTAINERS #####################################################

- name: containers
  rules:
  - alert: nextcloud_down
    expr: absent(container_memory_usage_bytes{name="nextcloud"})
    for: 30s
    labels:
      severity: critical
    annotations:
      summary: "Nextcloud down"
      description: "Nextcloud container is down for more than 30 seconds."

  - alert: ContainerCpuUsage
    expr: (sum(rate(container_cpu_usage_seconds_total{name!=""}[3m])) BY (instance, name) * 100) > 80
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Container CPU usage (instance {{ $labels.instance }})
      description: "Container CPU usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

  - alert: ContainerMemoryUsage
    expr: (sum(container_memory_working_set_bytes{name!=""}) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) > 80
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Container Memory usage (instance {{ $labels.instance }})
      description: "Container Memory usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

## FOR NGINX ##########################################################

- name: nginx
  rules:
  - alert: NginxHighHttp4xxErrorRate
    expr: sum(rate(nginx_http_requests_total{status=~"^4.."}[1m])) / sum(rate(nginx_http_requests_total[1m])) * 100 > 5
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: Nginx high HTTP 4xx error rate (instance {{ $labels.instance }})
      description: "Too many HTTP requests with status 4xx (> 5%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

  - alert: NginxHighHttp5xxErrorRate
    expr: sum(rate(nginx_http_requests_total{status=~"^5.."}[1m])) / sum(rate(nginx_http_requests_total[1m])) * 100 > 5
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: Nginx high HTTP 5xx error rate (instance {{ $labels.instance }})
      description: "Too many HTTP requests with status 5xx (> 5%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
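Before Prometheus picks these rules up, they can be syntax-checked with promtool, which ships with Prometheus. A minimal sketch, assuming promtool is available on the host:

    $ promtool check rules alertmanager/alert.rules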
alertmanager/alert.rules.BAK (new file, 172 lines)
@@ -0,0 +1,172 @@
[verbatim duplicate of alertmanager/alert.rules above; contents omitted]
alertmanager/alertmanager.yml (new file, 39 lines)
@@ -0,0 +1,39 @@
global:
  resolve_timeout: 5m

route:
  group_by: ['alertname']
  group_wait: 10s
  group_interval: 5m
  repeat_interval: 10m
  receiver: 'email'

receivers:
- name: 'email'
  email_configs:
  - to: 'mail1@mail.com, mail2@mail.com'
    from: ''
    smarthost:
    auth_username: ''
    auth_identity: ''
    auth_password: ''
    require_tls: yes
    send_resolved: true

# mute_time_intervals:
#   - name: out-of-business-hours
#     time_intervals:
#       - weekdays: ['Saturday','Sunday']
#       - times:
#           - start_time: '00:00'
#             end_time: '08:00'
#           - start_time: '18:00'
#             end_time: '24:00'

inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'dev', 'instance']
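The SMTP fields above are intentionally left blank. Once they are filled in, the whole file can be validated with amtool, the Alertmanager CLI. A minimal sketch, assuming amtool is installed locally:

    $ amtool check-config alertmanager/alertmanager.yml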
alertmanager/config.yml (new file, 11 lines)
@@ -0,0 +1,11 @@
route:
  receiver: 'slack'

receivers:
- name: 'slack'
  slack_configs:
  - send_resolved: true
    text: "{{ .CommonAnnotations.description }}"
    username: 'Prometheus'
    channel: '#prometheus'
    api_url: 'https://hooks.slack.com/services/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'
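With the stack running, the Slack route can be exercised end to end by injecting a synthetic alert into Alertmanager. A minimal sketch, assuming the 9093:9093 mapping from docker-compose.yml; the TestAlert label values are placeholders:

    $ amtool alert add alertname=TestAlert severity=warning --alertmanager.url=http://localhost:9093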
alertmanager/custom-alert.yml (new file, 37 lines)
@@ -0,0 +1,37 @@
# Whether to notify about resolved alerts.
[ send_resolved: <boolean> | default = false ]

# The email address to send notifications to.
to: <tmpl_string>

# The sender's address.
[ from: <tmpl_string> | default = global.smtp_from ]

# The SMTP host through which emails are sent.
[ smarthost: <string> | default = global.smtp_smarthost ]

# The hostname to identify to the SMTP server.
[ hello: <string> | default = global.smtp_hello ]

# SMTP authentication information.
[ auth_username: <string> | default = global.smtp_auth_username ]
[ auth_password: <secret> | default = global.smtp_auth_password ]
[ auth_secret: <secret> | default = global.smtp_auth_secret ]
[ auth_identity: <string> | default = global.smtp_auth_identity ]

# The SMTP TLS requirement.
# Note that Go does not support unencrypted connections to remote SMTP endpoints.
[ require_tls: <bool> | default = global.smtp_require_tls ]

# TLS configuration.
tls_config:
  [ <tls_config> ]

# The HTML body of the email notification.
[ html: <tmpl_string> | default = '{{ template "email.default.html" . }}' ]
# The text body of the email notification.
[ text: <tmpl_string> ]

# Further email header key/value pairs. Overrides any headers
# previously set by the notification implementation.
[ headers: { <string>: <tmpl_string>, ... } ]
(file removed, 42 lines)
@@ -1,42 +0,0 @@
version: '3.6'

services:

  influxdb:
    image: influxdb:1.8-alpine
    env_file: .env
    ports:
      - '8086:8086'
    volumes:
      - influxdb_data:/var/lib/influxdb
      - ./influxdb/imports:/imports

  grafana:
    image: grafana/grafana:8.0.2
    depends_on:
      - influxdb
    env_file: .env
    links:
      - influxdb
    ports:
      - '3000:3000'
    volumes:
      - grafana_data:/var/lib/grafana
      - ./grafana/provisioning/:/etc/grafana/provisioning/
      - ./grafana/dashboards/:/var/lib/grafana/dashboards/

  telegraf:
    image: telegraf:1.18-alpine
    # image: telegraf:latest #(for amd64)
    volumes:
      - ./telegraf/telegraf.conf:/etc/telegraf/telegraf.conf:ro
    depends_on:
      - influxdb
    links:
      - influxdb
    ports:
      - '8125:8125/udp'

volumes:
  grafana_data: {}
  influxdb_data: {}
@@ -4,8 +4,26 @@ services:
   telegraf:
     image: telegraf:1.18-alpine
-    # image: telegraf:latest #(for amd64)
+    # image: telegraf:latest #(for arm64)
     volumes:
       - ./telegraf/telegraf.conf:/etc/telegraf/telegraf.conf:ro
     ports:
       - '8125:8125/udp'
+
+  cadvisor:
+    image: gcr.io/cadvisor/cadvisor
+    # + image arm64
+    container_name: cadvisor
+    volumes:
+      - /:/rootfs:ro
+      - /var/run:/var/run:rw
+      - /sys:/sys:ro
+      - /var/lib/docker:/var/lib/docker:ro
+      - /cgroup:/cgroup:ro # Linux only; does not work on macOS
+    restart: unless-stopped
+    ports:
+      - "8080:8080"
+    # networks:
+    #   - monitor-net
+    # labels:
+    #   org.label-schema.group: "monitoring"
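Once the cadvisor service is up, its metrics endpoint can be spot-checked from the host. A minimal sketch, assuming the 8080:8080 mapping above:

    $ curl -s http://localhost:8080/metrics | grep -m 3 container_cpu_usage_seconds_total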
docker-compose.yml (new file, 85 lines)
@@ -0,0 +1,85 @@
version: '3.6'

services:

  influxdb:
    image: influxdb:1.8-alpine
    container_name: influxdb
    restart: unless-stopped
    env_file: .env
    ports:
      - '8086:8086'
    volumes:
      - influxdb_data:/var/lib/influxdb
      - ./influxdb/imports:/imports

  grafana:
    image: grafana/grafana:8.0.2
    container_name: grafana
    restart: unless-stopped
    depends_on:
      - influxdb
    env_file: .env
    links:
      - influxdb
    ports:
      - '3000:3000'
    volumes:
      - grafana_data:/var/lib/grafana
      - ./grafana/provisioning/:/etc/grafana/provisioning/
      - ./grafana/dashboards/:/var/lib/grafana/dashboards/

  telegraf:
    image: telegraf:1.18-alpine
    # image: telegraf:latest #(for amd64)
    container_name: telegraf
    restart: unless-stopped
    volumes:
      - ./telegraf/telegraf.conf:/etc/telegraf/telegraf.conf:ro
    depends_on:
      - influxdb
    links:
      - influxdb
    ports:
      - '8125:8125/udp'

  prometheus:
    image: prom/prometheus:v2.17.1
    container_name: prometheus
    restart: unless-stopped
    volumes:
      - ./prometheus:/etc/prometheus
      - prometheus_data:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--web.console.libraries=/etc/prometheus/console_libraries'
      - '--web.console.templates=/etc/prometheus/consoles'
      - '--storage.tsdb.retention.time=200h'
      - '--web.enable-lifecycle'
    ports:
      - "9090:9090"

  alertmanager:
    image: prom/alertmanager:v0.20.0
    container_name: alertmanager
    restart: unless-stopped
    volumes:
      - ./alertmanager:/etc/alertmanager
    command:
      - '--config.file=/etc/alertmanager/alertmanager.yml'
      - '--storage.path=/alertmanager'
    ports:
      - "9093:9093"

  pushgateway:
    image: prom/pushgateway:v1.2.0
    container_name: pushgateway
    restart: unless-stopped
    ports:
      - "9091:9091"
volumes:
  grafana_data: {}
  influxdb_data: {}
  prometheus_data: {}
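A quick smoke test after bringing the stack up, assuming the port mappings above; the second command asks Prometheus's HTTP API whether each scrape target is healthy:

    $ docker-compose up -d
    $ curl -s http://localhost:9090/api/v1/targets | grep -o '"health":"[^"]*"'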
prometheus/prometheus.yml (new file, 53 lines)
@@ -0,0 +1,53 @@
global:
  scrape_interval: 15s
  evaluation_interval: 15s

  # Attach these labels to any time series or alerts when communicating with
  # external systems (federation, remote storage, Alertmanager).
  external_labels:
    monitor: 'docker-host-alpha'

# Load and evaluate rules in this file every 'evaluation_interval' seconds.
rule_files:
  - "alert.rules"

# A scrape configuration containing exactly one endpoint to scrape.
scrape_configs:
  - job_name: 'nodeexporter'
    scrape_interval: 5s
    static_configs:
      - targets: ['nodeexporter:9100']

  - job_name: 'cadvisor'
    scrape_interval: 5s
    static_configs:
      - targets: ['cadvisor:8080']

  - job_name: 'prometheus'
    scrape_interval: 10s
    static_configs:
      - targets: ['localhost:9090']

  - job_name: 'pushgateway'
    scrape_interval: 10s
    honor_labels: true
    static_configs:
      - targets: ['pushgateway:9091']

alerting:
  alertmanagers:
    - scheme: http
      static_configs:
        - targets:
            - 'alertmanager:9093'

#  - job_name: 'nginx'
#    scrape_interval: 10s
#    static_configs:
#      - targets: ['nginxexporter:9113']

#  - job_name: 'aspnetcore'
#    scrape_interval: 10s
#    static_configs:
#      - targets: ['eventlog-proxy:5000', 'eventlog:5000']
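Because docker-compose.yml starts Prometheus with --web.enable-lifecycle, edits to this file (or to alert.rules) can be applied without restarting the container. A minimal sketch, assuming the 9090:9090 mapping:

    $ curl -X POST http://localhost:9090/-/reload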
telegraf/telegraf.conf
@@ -150,6 +150,8 @@
 ## Comment this line if you want the raw CPU time metrics
 fielddrop = ["time_*"]

+[[inputs.temp]]
+  # no configuration

 # Read metrics about disk usage by mount point
 [[inputs.disk]]
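The new [[inputs.temp]] plugin can be dry-run before deploying, printing sampled metrics to stdout instead of writing to InfluxDB. A minimal sketch, assuming a telegraf binary is available on the host:

    $ telegraf --config telegraf/telegraf.conf --test --input-filter temp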