Prometheus is pretty easy! I had never set up anything similar before either, and I found it quite straightforward. For reference, this is what my config looks like, with some irrelevant stuff removed:
# Prometheus configuration: scrape every Netdata instance once a minute.
global:
  scrape_interval: 1m
  evaluation_interval: 1m

scrape_configs:
  - job_name: 'netdata'
    # Netdata serves all of its metrics from this API endpoint.
    metrics_path: '/api/v1/allmetrics'
    params:
      # Sent as the ?format= query parameter on every scrape request.
      format: [prometheus_all_hosts]
    # Keep the labels supplied by the scraped data when they conflict with
    # the labels Prometheus would attach itself.
    honor_labels: true
    static_configs:
      - targets:
          - 'vps03.vpn.d.sb:19999'
          - 'vps07.vpn.d.sb:19999'
          - 'vps10.vpn.d.sb:19999'
          - 'vps11.vpn.d.sb:19999'
And the command I’m using to run it:
prometheus --storage.tsdb.retention.time=365d --log.level=info --web.listen-address=127.0.0.1:9090
I’m running it on a Windows server, but it’d be similar on Linux. You’d just configure the command line via systemd, or use a package that does that for you automatically.
Currently I’ve got data from 4th January until today, across four Netdata servers (some added recently though), one Windows server using wmi_exporter, and a few other things I’m monitoring, and my Prometheus data directory is around 2.9 GB in size. I’m scraping the data every minute. If needed, you could reduce the size even more by only scraping certain metrics - I’m scraping all of Netdata’s metrics into Prometheus.
Grafana is even easier - once it is installed and running, everything is configured in its web UI. Alerting in particular is easier to configure - you set it up in the UI while looking at a graph, rather than having to edit a YAML file.
Here’s one of my Grafana dashboards, for inspiration. It shows CPU, RAM and disk usage across all my VPSes:
https://dash.d.sb/d/yLPMYDwik/all-servers
Here’s the JSON for that dashboard, if you want it (you can import it into your own Grafana instance):
{
  "annotations": {
    "list": [
      {
        "builtIn": 1,
        "datasource": "-- Grafana --",
        "enable": true,
        "hide": true,
        "iconColor": "rgba(0, 211, 255, 1)",
        "name": "Annotations & Alerts",
        "type": "dashboard"
      }
    ]
  },
  "editable": true,
  "gnetId": null,
  "graphTooltip": 0,
  "id": 1,
  "links": [],
  "panels": [
    {
      "aliasColors": {},
      "bars": false,
      "dashLength": 10,
      "dashes": false,
      "fill": 0,
      "gridPos": {
        "h": 6,
        "w": 24,
        "x": 0,
        "y": 0
      },
      "id": 2,
      "legend": {
        "alignAsTable": true,
        "avg": false,
        "current": true,
        "max": true,
        "min": true,
        "rightSide": true,
        "show": true,
        "total": false,
        "values": true
      },
      "lines": true,
      "linewidth": 1,
      "links": [],
      "nullPointMode": "null",
      "percentage": false,
      "pointradius": 5,
      "points": false,
      "renderer": "flot",
      "seriesOverrides": [],
      "spaceLength": 10,
      "stack": false,
      "steppedLine": false,
      "targets": [
        {
          "expr": "100 - avg(netdata_cpu_cpu_percentage_average{dimension=\"idle\"}) by (instance)",
          "format": "time_series",
          "interval": "",
          "intervalFactor": 1,
          "legendFormat": "{{instance}}",
          "refId": "A"
        },
        {
          "expr": "100 - 100 / sum(rate(wmi_cpu_time_total[5m])) by (instance) * sum(rate(wmi_cpu_time_total{mode=\"idle\"}[5m])) by (instance)",
          "format": "time_series",
          "interval": "",
          "intervalFactor": 1,
          "legendFormat": "{{instance}}",
          "refId": "B"
        }
      ],
      "thresholds": [],
      "timeFrom": null,
      "timeRegions": [],
      "timeShift": null,
      "title": "CPU Usage",
      "tooltip": {
        "shared": true,
        "sort": 0,
        "value_type": "individual"
      },
      "type": "graph",
      "xaxis": {
        "buckets": null,
        "mode": "time",
        "name": null,
        "show": true,
        "values": []
      },
      "yaxes": [
        {
          "format": "percent",
          "label": null,
          "logBase": 1,
          "max": null,
          "min": null,
          "show": true
        },
        {
          "format": "short",
          "label": null,
          "logBase": 1,
          "max": null,
          "min": null,
          "show": false
        }
      ],
      "yaxis": {
        "align": false,
        "alignLevel": null
      }
    },
    {
      "aliasColors": {},
      "bars": false,
      "dashLength": 10,
      "dashes": false,
      "fill": 1,
      "gridPos": {
        "h": 6,
        "w": 12,
        "x": 0,
        "y": 6
      },
      "id": 6,
      "legend": {
        "alignAsTable": true,
        "avg": false,
        "current": true,
        "max": false,
        "min": false,
        "rightSide": true,
        "show": true,
        "total": false,
        "values": true
      },
      "lines": true,
      "linewidth": 1,
      "links": [],
      "nullPointMode": "null",
      "percentage": false,
      "pointradius": 5,
      "points": false,
      "renderer": "flot",
      "seriesOverrides": [],
      "spaceLength": 10,
      "stack": false,
      "steppedLine": false,
      "targets": [
        {
          "expr": "netdata_mem_available_MiB_average",
          "format": "time_series",
          "intervalFactor": 1,
          "legendFormat": "{{instance}}",
          "refId": "A"
        },
        {
          "expr": "wmi_os_physical_memory_free_bytes / 1024 / 1024",
          "format": "time_series",
          "intervalFactor": 1,
          "legendFormat": "{{instance}}",
          "refId": "B"
        }
      ],
      "thresholds": [],
      "timeFrom": null,
      "timeRegions": [],
      "timeShift": null,
      "title": "Available Memory",
      "tooltip": {
        "shared": true,
        "sort": 0,
        "value_type": "individual"
      },
      "type": "graph",
      "xaxis": {
        "buckets": null,
        "mode": "time",
        "name": null,
        "show": true,
        "values": []
      },
      "yaxes": [
        {
          "format": "mbytes",
          "label": null,
          "logBase": 1,
          "max": null,
          "min": null,
          "show": true
        },
        {
          "format": "short",
          "label": null,
          "logBase": 1,
          "max": null,
          "min": null,
          "show": true
        }
      ],
      "yaxis": {
        "align": false,
        "alignLevel": null
      }
    },
    {
      "alert": {
        "conditions": [
          {
            "evaluator": {
              "params": [
                5
              ],
              "type": "lt"
            },
            "operator": {
              "type": "and"
            },
            "query": {
              "params": [
                "A",
                "5m",
                "now"
              ]
            },
            "reducer": {
              "params": [],
              "type": "avg"
            },
            "type": "query"
          }
        ],
        "executionErrorState": "alerting",
        "for": "5m",
        "frequency": "1m",
        "handler": 1,
        "message": "Free disk space is low!",
        "name": "Free Disk Space on Primary Disk alert",
        "noDataState": "no_data",
        "notifications": [
          {
            "id": 1
          }
        ]
      },
      "aliasColors": {},
      "bars": false,
      "dashLength": 10,
      "dashes": false,
      "fill": 1,
      "gridPos": {
        "h": 6,
        "w": 12,
        "x": 12,
        "y": 6
      },
      "id": 4,
      "legend": {
        "alignAsTable": true,
        "avg": false,
        "current": true,
        "max": false,
        "min": false,
        "rightSide": true,
        "show": true,
        "total": false,
        "values": true
      },
      "lines": true,
      "linewidth": 1,
      "links": [],
      "nullPointMode": "null",
      "percentage": false,
      "pointradius": 5,
      "points": false,
      "renderer": "flot",
      "seriesOverrides": [],
      "spaceLength": 10,
      "stack": false,
      "steppedLine": false,
      "targets": [
        {
          "expr": "netdata_disk_space_GiB_average{family=\"/\", dimension=\"avail\"}",
          "format": "time_series",
          "intervalFactor": 1,
          "legendFormat": "{{instance}}",
          "refId": "A"
        },
        {
          "expr": "wmi_logical_disk_free_bytes{volume=\"C:\"} / 1024 / 1024 / 1024",
          "format": "time_series",
          "intervalFactor": 1,
          "legendFormat": "{{instance}}",
          "refId": "B"
        }
      ],
      "thresholds": [
        {
          "colorMode": "critical",
          "fill": true,
          "line": true,
          "op": "lt",
          "value": 5
        }
      ],
      "timeFrom": null,
      "timeRegions": [],
      "timeShift": null,
      "title": "Free Disk Space on Primary Disk",
      "tooltip": {
        "shared": true,
        "sort": 0,
        "value_type": "individual"
      },
      "type": "graph",
      "xaxis": {
        "buckets": null,
        "mode": "time",
        "name": null,
        "show": true,
        "values": []
      },
      "yaxes": [
        {
          "format": "gbytes",
          "label": null,
          "logBase": 1,
          "max": null,
          "min": null,
          "show": true
        },
        {
          "format": "short",
          "label": null,
          "logBase": 1,
          "max": null,
          "min": null,
          "show": false
        }
      ],
      "yaxis": {
        "align": false,
        "alignLevel": null
      }
    },
    {
      "aliasColors": {},
      "bars": false,
      "dashLength": 10,
      "dashes": false,
      "fill": 1,
      "gridPos": {
        "h": 8,
        "w": 24,
        "x": 0,
        "y": 12
      },
      "id": 8,
      "legend": {
        "alignAsTable": true,
        "avg": true,
        "current": true,
        "max": true,
        "min": true,
        "rightSide": true,
        "show": true,
        "total": false,
        "values": true
      },
      "lines": true,
      "linewidth": 1,
      "links": [],
      "nullPointMode": "null",
      "percentage": false,
      "pointradius": 5,
      "points": false,
      "renderer": "flot",
      "seriesOverrides": [
        {
          "alias": "/.+ sent/",
          "transform": "negative-Y"
        }
      ],
      "spaceLength": 10,
      "stack": false,
      "steppedLine": false,
      "targets": [
        {
          "expr": "netdata_system_net_kilobits_persec_average{dimension=\"received\"}",
          "format": "time_series",
          "intervalFactor": 1,
          "legendFormat": "{{instance}} received",
          "refId": "A"
        },
        {
          "expr": "-netdata_system_net_kilobits_persec_average{dimension=\"sent\"}",
          "format": "time_series",
          "intervalFactor": 1,
          "legendFormat": "{{instance}} sent",
          "refId": "B"
        },
        {
          "expr": "sum(rate(wmi_net_bytes_sent_total[5m])) by (instance) * 8 / 1000",
          "format": "time_series",
          "intervalFactor": 1,
          "legendFormat": "{{instance}} sent",
          "refId": "C"
        },
        {
          "expr": "sum(rate(wmi_net_bytes_received_total[5m])) by (instance) * 8 / 1000",
          "format": "time_series",
          "intervalFactor": 1,
          "legendFormat": "{{instance}} received",
          "refId": "D"
        }
      ],
      "thresholds": [],
      "timeFrom": null,
      "timeRegions": [],
      "timeShift": null,
      "title": "Network Bandwidth",
      "tooltip": {
        "shared": true,
        "sort": 0,
        "value_type": "individual"
      },
      "type": "graph",
      "xaxis": {
        "buckets": null,
        "mode": "time",
        "name": null,
        "show": true,
        "values": []
      },
      "yaxes": [
        {
          "format": "Kbits",
          "label": null,
          "logBase": 1,
          "max": null,
          "min": null,
          "show": true
        },
        {
          "format": "short",
          "label": null,
          "logBase": 1,
          "max": null,
          "min": null,
          "show": true
        }
      ],
      "yaxis": {
        "align": false,
        "alignLevel": null
      }
    }
  ],
  "schemaVersion": 16,
  "style": "dark",
  "tags": [],
  "templating": {
    "list": []
  },
  "time": {
    "from": "now-6h",
    "to": "now"
  },
  "timepicker": {
    "refresh_intervals": [
      "5s",
      "10s",
      "30s",
      "1m",
      "5m",
      "15m",
      "30m",
      "1h",
      "2h",
      "1d"
    ],
    "time_options": [
      "5m",
      "15m",
      "1h",
      "6h",
      "12h",
      "24h",
      "2d",
      "7d",
      "30d"
    ]
  },
  "timezone": "",
  "title": "All Servers",
  "uid": "yLPMYDwik",
  "version": 15
}
I wouldn’t bother with node_exporter if you’re using Netdata too - instead, just configure Prometheus to scrape from Netdata.