From ad12256bdc1bb829de16d4d80786426310631e62 Mon Sep 17 00:00:00 2001 From: greglebreton Date: Thu, 29 Dec 2022 23:06:34 +0100 Subject: [PATCH] push --- README.md | 51 +++ docker-compose-server.yml | 42 ++ docker-compose-telegraf-agent.yml | 11 + grafana/dashboards/performance.json | 373 ++++++++++++++++++ grafana/provisioning/dashboards/all.yml | 11 + grafana/provisioning/datasources/influxdb.yml | 46 +++ telegraf/telegraf.conf | 212 ++++++++++ 7 files changed, 746 insertions(+) create mode 100644 README.md create mode 100644 docker-compose-server.yml create mode 100644 docker-compose-telegraf-agent.yml create mode 100644 grafana/dashboards/performance.json create mode 100644 grafana/provisioning/dashboards/all.yml create mode 100644 grafana/provisioning/datasources/influxdb.yml create mode 100644 telegraf/telegraf.conf diff --git a/README.md b/README.md new file mode 100644 index 0000000..3c99898 --- /dev/null +++ b/README.md @@ -0,0 +1,51 @@ +# Telegraf-influxDB-grafana + +![LOGO](docs/logo.png) + +## Config + +- Grafana: grafana/provisioning/datasources/influxdb.yml +- InfluxDB: + +### Production use + +![PUSE](docs/production.png) + +## Usage + +- start stack +```bash +docker-compose -f docker-compose-server.yml up -d +``` + +- see logs +```bash +docker-compose -f docker-compose-server.yml logs -f +``` + +## Dashboards + +- AMD64: +- ARM64: + +--------------------- +## Services and Ports + +### Grafana + +- URL: http://localhost:3000 +- User: admin +- Password: admin + +### Telegraf + +- Port: 8125 UDP (StatsD input) + +### InfluxDB + +- Port: 8086 (HTTP API) +- User: admin +- Password: admin +- Database: influx + + diff --git a/docker-compose-server.yml b/docker-compose-server.yml new file mode 100644 index 0000000..6e53bfe --- /dev/null +++ b/docker-compose-server.yml @@ -0,0 +1,42 @@ +version: '3.6' + +services: + + influxdb: + image: influxdb:1.8-alpine + env_file: .env + ports: + - '8086:8086' + volumes: + - influxdb_data:/var/lib/influxdb + - ./influxdb/imports:/imports + + grafana: + image: 
grafana/grafana:8.0.2 + depends_on: + - influxdb + env_file: .env + links: + - influxdb + ports: + - '3000:3000' + volumes: + - grafana_data:/var/lib/grafana + - ./grafana/provisioning/:/etc/grafana/provisioning/ + - ./grafana/dashboards/:/var/lib/grafana/dashboards/ + + telegraf: + image: telegraf:1.18-alpine + # image: telegraf:latest #(for amd64) + volumes: + - ./telegraf/telegraf.conf:/etc/telegraf/telegraf.conf:ro + depends_on: + - influxdb + links: + - influxdb + ports: + - '8125:8125/udp' + +volumes: + grafana_data: {} + influxdb_data: {} diff --git a/docker-compose-telegraf-agent.yml b/docker-compose-telegraf-agent.yml new file mode 100644 index 0000000..c815162 --- /dev/null +++ b/docker-compose-telegraf-agent.yml @@ -0,0 +1,11 @@ +version: "3.6" + +services: + + telegraf: + image: telegraf:1.18-alpine + # image: telegraf:latest #(for amd64) + volumes: + - ./telegraf/telegraf.conf:/etc/telegraf/telegraf.conf:ro + ports: + - '8125:8125/udp' \ No newline at end of file diff --git a/grafana/dashboards/performance.json b/grafana/dashboards/performance.json new file mode 100644 index 0000000..407b763 --- /dev/null +++ b/grafana/dashboards/performance.json @@ -0,0 +1,373 @@ +{ + "annotations": { + "list": [ + { + "$$hashKey": "object:7", + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "links": [], + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "InfluxDB", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 24, + "x": 0, + "y": 0 + }, + "hiddenSeries": false, + "id": 4, + "interval": "", + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + 
"links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "alias": "$tag_type", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "type" + ], + "type": "tag" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "measurement": "performance_request_successful_time", + "orderByTime": "ASC", + "policy": "default", + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "90_percentile" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [] + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Request Time", + "tooltip": { + "shared": true, + "sort": 1, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:237", + "decimals": null, + "format": "ms", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:238", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "InfluxDB", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 11, + "w": 24, + "x": 0, + "y": 9 + }, + "hiddenSeries": false, + "id": 2, + "legend": { + "alignAsTable": true, + "avg": true, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": true, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": 
"null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "alias": "Request Type: $tag_type", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "type" + ], + "type": "tag" + }, + { + "params": [ + "none" + ], + "type": "fill" + } + ], + "hide": false, + "measurement": "performance_request_successful_count", + "orderByTime": "ASC", + "policy": "default", + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "sum" + }, + { + "params": [ + " / $__interval_ms*1000" + ], + "type": "math" + } + ] + ], + "tags": [] + }, + { + "alias": "All Types", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + } + ], + "hide": false, + "measurement": "performance_request_successful_count", + "orderByTime": "ASC", + "policy": "default", + "refId": "B", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "sum" + }, + { + "params": [ + " / $__interval_ms*1000" + ], + "type": "math" + } + ] + ], + "tags": [] + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Requests per Second", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:126", + "format": "reqps", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:127", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null 
+ } + } + ], + "refresh": "5s", + "schemaVersion": 22, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-5m", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Performance", + "uid": "1Mar-DTiz", + "variables": { + "list": [] + }, + "version": 1 +} \ No newline at end of file diff --git a/grafana/provisioning/dashboards/all.yml b/grafana/provisioning/dashboards/all.yml new file mode 100644 index 0000000..0f62519 --- /dev/null +++ b/grafana/provisioning/dashboards/all.yml @@ -0,0 +1,11 @@ +apiVersion: 1 + +providers: +- name: 'default' + orgId: 1 + folder: '' + type: file + disableDeletion: false + updateIntervalSeconds: 3 #how often Grafana will scan for changed dashboards + options: + path: /var/lib/grafana/dashboards diff --git a/grafana/provisioning/datasources/influxdb.yml b/grafana/provisioning/datasources/influxdb.yml new file mode 100644 index 0000000..8a254bf --- /dev/null +++ b/grafana/provisioning/datasources/influxdb.yml @@ -0,0 +1,46 @@ +# config file version +apiVersion: 1 + +# list of datasources that should be deleted from the database +deleteDatasources: + - name: Influxdb + orgId: 1 + +# list of datasources to insert/update depending on +# what's available in the database +datasources: + # name of the datasource. Required +- name: InfluxDB + # datasource type. Required + type: influxdb + # access mode. direct or proxy. Required + access: proxy + # org id. 
will default to orgId 1 if not specified + orgId: 1 + # url + url: http://influxdb:8086 + # database password, if used + password: "admin" + # database user, if used + user: "admin" + # database name, if used + database: "influx" + # enable/disable basic auth + basicAuth: false +# withCredentials: + # mark as default datasource. Max one per org + isDefault: true + # fields that will be converted to json and stored in json_data + jsonData: + timeInterval: "5s" +# graphiteVersion: "1.1" +# tlsAuth: false +# tlsAuthWithCACert: false +# # json object of data that will be encrypted. +# secureJsonData: +# tlsCACert: "..." +# tlsClientCert: "..." +# tlsClientKey: "..." + version: 1 + # allow users to edit datasources from the UI. + editable: false diff --git a/telegraf/telegraf.conf b/telegraf/telegraf.conf new file mode 100644 index 0000000..d2174de --- /dev/null +++ b/telegraf/telegraf.conf @@ -0,0 +1,212 @@ +# Telegraf configuration + +# Telegraf is entirely plugin driven. All metrics are gathered from the +# declared inputs, and sent to the declared outputs. + +# Plugins must be declared in here to be active. +# To deactivate a plugin, comment out the name and any variables. + +# Use 'telegraf -config telegraf.conf -test' to see what metrics a config +# file would generate. + +# Global tags can be specified here in key="value" format. +[global_tags] + # dc = "us-east-1" # will tag all metrics with dc=us-east-1 + # rack = "1a" + +# Configuration for telegraf agent +[agent] + ## Default data collection interval for all inputs + interval = "5s" + ## Rounds collection interval to 'interval' + ## ie, if interval="10s" then always collect on :00, :10, :20, etc. + round_interval = true + + ## Telegraf will cache metric_buffer_limit metrics for each output, and will + ## flush this buffer on a successful write. + metric_buffer_limit = 10000 + ## Flush the buffer whenever full, regardless of flush_interval. 
+ flush_buffer_when_full = true + + ## Collection jitter is used to jitter the collection by a random amount. + ## Each plugin will sleep for a random time within jitter before collecting. + ## This can be used to avoid many plugins querying things like sysfs at the + ## same time, which can have a measurable effect on the system. + collection_jitter = "0s" + + ## Default flushing interval for all outputs. You shouldn't set this below + ## interval. Maximum flush_interval will be flush_interval + flush_jitter + flush_interval = "1s" + ## Jitter the flush interval by a random amount. This is primarily to avoid + ## large write spikes for users running a large number of telegraf instances. + ## ie, a jitter of 5s and interval 10s means flushes will happen every 10-15s + flush_jitter = "0s" + + ## Run telegraf in debug mode + debug = false + ## Run telegraf in quiet mode + quiet = false + ## Override default hostname, if empty use os.Hostname() + hostname = "lenovo" + + +############################################################################### +# OUTPUTS # +############################################################################### + +# Configuration for influxdb server to send metrics to +[[outputs.influxdb]] + # The full HTTP or UDP endpoint URL for your InfluxDB instance. + # Multiple urls can be specified but it is assumed that they are part of the same + # cluster, this means that only ONE of the urls will be written to each interval. + # urls = ["udp://localhost:8089"] # UDP endpoint example + urls = ["http://influxdb:8086"] # required + # The target database for metrics (telegraf will create it if not exists) + database = "influx" # required + # Precision of writes, valid values are "ns", "us" (or "µs"), "ms", "s", "m", "h". + # note: using second precision greatly helps InfluxDB compression + precision = "s" + + ## Write timeout (for the InfluxDB client), formatted as a string. + ## If not provided, will default to 5s. 
0s means no timeout (not recommended). + timeout = "5s" + # username = "telegraf" + # password = "metricsmetricsmetricsmetrics" + # Set the user agent for HTTP POSTs (can be useful for log differentiation) + # user_agent = "telegraf" + # Set UDP payload size, defaults to InfluxDB UDP Client default (512 bytes) + # udp_payload = 512 + + +############################################################################### +# INPUTS # +############################################################################### +# Statsd Server +[[inputs.statsd]] + ## Protocol, must be "tcp", "udp4", "udp6" or "udp" (default=udp) + protocol = "udp" + + ## MaxTCPConnection - applicable when protocol is set to tcp (default=250) + max_tcp_connections = 250 + + ## Enable TCP keep alive probes (default=false) + tcp_keep_alive = false + + ## Specifies the keep-alive period for an active network connection. + ## Only applies to TCP sockets and will be ignored if tcp_keep_alive is false. + ## Defaults to the OS configuration. + # tcp_keep_alive_period = "2h" + + ## Address and port to host UDP listener on + service_address = ":8125" + + ## The following configuration options control when telegraf clears its cache + ## of previous values. If set to false, then telegraf will only clear its + ## cache when the daemon is restarted. 
+ ## Reset gauges every interval (default=true) + delete_gauges = true + ## Reset counters every interval (default=true) + delete_counters = true + ## Reset sets every interval (default=true) + delete_sets = true + ## Reset timings & histograms every interval (default=true) + delete_timings = true + + ## Percentiles to calculate for timing & histogram stats + percentiles = [90] + + ## Separator to use between elements of a statsd metric + metric_separator = "_" + + ## Parses tags in the datadog statsd format + ## http://docs.datadoghq.com/guides/dogstatsd/ + parse_data_dog_tags = false + + ## Statsd data translation templates, more info can be read here: + ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md#graphite + # templates = [ + # "cpu.* measurement*" + # ] + + ## Number of UDP messages allowed to queue up, once filled, + ## the statsd server will start dropping packets + allowed_pending_messages = 10000 + + ## Number of timing/histogram values to track per-measurement in the + ## calculation of percentiles. Raising this limit increases the accuracy + ## of percentiles but also increases the memory usage and cpu time. + percentile_limit = 1000 + + ## Maximum socket buffer size in bytes, once the buffer fills up, metrics + ## will start dropping. Defaults to the OS default. + # read_buffer_size = 65535 + +# Read metrics about cpu usage +[[inputs.cpu]] + ## Whether to report per-cpu stats or not + percpu = true + ## Whether to report total system cpu stats or not + totalcpu = true + ## Comment this line if you want the raw CPU time metrics + fielddrop = ["time_*"] + + +# Read metrics about disk usage by mount point +[[inputs.disk]] + ## By default, telegraf gathers stats for all mountpoints. + ## Setting mountpoints will restrict the stats to the specified mountpoints. + # mount_points = ["/"] + + ## Ignore some mountpoints by filesystem type. For example (dev)tmpfs (usually + ## present on /run, /var/run, /dev/shm or /dev). 
+ ignore_fs = ["tmpfs", "devtmpfs"] + + +# Read metrics about disk IO by device +[[inputs.diskio]] + ## By default, telegraf will gather stats for all devices including + ## disk partitions. + ## Setting devices will restrict the stats to the specified devices. + # devices = ["sda", "sdb"] + ## Uncomment the following line if you need disk serial numbers. + # skip_serial_number = false + + +# Get kernel statistics from /proc/stat +[[inputs.kernel]] + # no configuration + + +# Read metrics about memory usage +[[inputs.mem]] + # no configuration + + +# Get the number of processes and group them by status +[[inputs.processes]] + # no configuration + + +# Read metrics about swap memory usage +[[inputs.swap]] + # no configuration + + +# Read metrics about system load & uptime +[[inputs.system]] + # no configuration + +# Read metrics about network interface usage +[[inputs.net]] + # collect data only about specific interfaces + # interfaces = ["eth0"] + + +[[inputs.netstat]] + # no configuration + +[[inputs.interrupts]] + # no configuration + +[[inputs.linux_sysctl_fs]] + # no configuration