Last active
April 23, 2024 19:52
-
-
Save pandeybk/eafcdbcb92cec30581268094f64e591a to your computer and use it in GitHub Desktop.
dcgm-dashboard.json
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "annotations": { | |
| "list": [ | |
| { | |
| "$$hashKey": "object:192", | |
| "builtIn": 1, | |
| "datasource": { | |
| "type": "datasource", | |
| "uid": "grafana" | |
| }, | |
| "enable": true, | |
| "hide": true, | |
| "iconColor": "rgba(0, 211, 255, 1)", | |
| "name": "Annotations & Alerts", | |
| "target": { | |
| "limit": 100, | |
| "matchAny": false, | |
| "tags": [], | |
| "type": "dashboard" | |
| }, | |
| "type": "dashboard" | |
| } | |
| ] | |
| }, | |
| "description": "This dashboard displays metrics from DCGM Exporter on a Kubernetes (1.19+) cluster", | |
| "editable": true, | |
| "fiscalYearStartMonth": 0, | |
| "gnetId": 12239, | |
| "graphTooltip": 0, | |
| "id": 2, | |
| "links": [], | |
| "liveNow": false, | |
| "panels": [ | |
| { | |
| "datasource": { | |
| "type": "prometheus", | |
| "uid": "$datasource" | |
| }, | |
| "fieldConfig": { | |
| "defaults": { | |
| "color": { | |
| "mode": "thresholds" | |
| }, | |
| "custom": { | |
| "align": "auto", | |
| "displayMode": "auto", | |
| "inspect": false | |
| }, | |
| "mappings": [], | |
| "thresholds": { | |
| "mode": "absolute", | |
| "steps": [ | |
| { | |
| "color": "green", | |
| "value": null | |
| }, | |
| { | |
| "color": "red", | |
| "value": 80 | |
| } | |
| ] | |
| } | |
| }, | |
| "overrides": [ | |
| { | |
| "matcher": { | |
| "id": "byName", | |
| "options": "device" | |
| }, | |
| "properties": [ | |
| { | |
| "id": "custom.width", | |
| "value": 115 | |
| } | |
| ] | |
| }, | |
| { | |
| "matcher": { | |
| "id": "byName", | |
| "options": "gpu" | |
| }, | |
| "properties": [ | |
| { | |
| "id": "custom.width", | |
| "value": 78 | |
| } | |
| ] | |
| }, | |
| { | |
| "matcher": { | |
| "id": "byName", | |
| "options": "exported_container" | |
| }, | |
| "properties": [ | |
| { | |
| "id": "custom.width", | |
| "value": 128 | |
| } | |
| ] | |
| }, | |
| { | |
| "matcher": { | |
| "id": "byName", | |
| "options": "exported_pod" | |
| }, | |
| "properties": [ | |
| { | |
| "id": "custom.width", | |
| "value": 122 | |
| } | |
| ] | |
| }, | |
| { | |
| "matcher": { | |
| "id": "byName", | |
| "options": "Namespace" | |
| }, | |
| "properties": [ | |
| { | |
| "id": "custom.width", | |
| "value": 158 | |
| } | |
| ] | |
| }, | |
| { | |
| "matcher": { | |
| "id": "byName", | |
| "options": "Container" | |
| }, | |
| "properties": [ | |
| { | |
| "id": "custom.width", | |
| "value": 149 | |
| } | |
| ] | |
| } | |
| ] | |
| }, | |
| "gridPos": { | |
| "h": 8, | |
| "w": 18, | |
| "x": 0, | |
| "y": 0 | |
| }, | |
| "id": 26, | |
| "options": { | |
| "footer": { | |
| "fields": "", | |
| "reducer": [ | |
| "sum" | |
| ], | |
| "show": false | |
| }, | |
| "showHeader": true, | |
| "sortBy": [] | |
| }, | |
| "pluginVersion": "9.1.6", | |
| "targets": [ | |
| { | |
| "datasource": { | |
| "type": "prometheus", | |
| "uid": "$datasource" | |
| }, | |
| "editorMode": "code", | |
| "expr": "DCGM_FI_DEV_GPU_UTIL{instance=~\"$instance\", gpu=~\"$gpu\"}", | |
| "format": "table", | |
| "legendFormat": "__auto", | |
| "range": true, | |
| "refId": "A" | |
| }, | |
| { | |
| "datasource": { | |
| "type": "prometheus", | |
| "uid": "$datasource" | |
| }, | |
| "editorMode": "code", | |
| "expr": "DCGM_FI_DEV_GPU_UTIL{instance=~\"$instance\"}", | |
| "format": "table", | |
| "hide": false, | |
| "legendFormat": "__auto", | |
| "range": true, | |
| "refId": "B" | |
| } | |
| ], | |
| "title": "GPU Utilization/ Node/ GPU", | |
| "transformations": [ | |
| { | |
| "id": "seriesToColumns", | |
| "options": { | |
| "byField": "instance" | |
| } | |
| }, | |
| { | |
| "id": "organize", | |
| "options": { | |
| "excludeByName": { | |
| "Hostname 2": true, | |
| "Time 1": true, | |
| "Time 2": true, | |
| "UUID 1": true, | |
| "UUID 2": true, | |
| "Value #B": true, | |
| "__name__ 1": true, | |
| "__name__ 2": true, | |
| "container 1": true, | |
| "container 2": true, | |
| "device 2": true, | |
| "endpoint 1": true, | |
| "endpoint 2": true, | |
| "exported_container 2": true, | |
| "exported_namespace 2": true, | |
| "exported_pod 2": true, | |
| "gpu 2": true, | |
| "instance": true, | |
| "job 1": true, | |
| "job 2": true, | |
| "modelName 1": true, | |
| "modelName 2": true, | |
| "namespace 1": true, | |
| "namespace 2": true, | |
| "pod 1": true, | |
| "pod 2": true, | |
| "service 1": true, | |
| "service 2": true | |
| }, | |
| "indexByName": { | |
| "Hostname 1": 1, | |
| "Hostname 2": 19, | |
| "Time 1": 3, | |
| "Time 2": 18, | |
| "UUID 1": 4, | |
| "UUID 2": 20, | |
| "Value #A": 17, | |
| "Value #B": 34, | |
| "__name__ 1": 5, | |
| "__name__ 2": 21, | |
| "container 1": 6, | |
| "container 2": 22, | |
| "device 1": 7, | |
| "device 2": 23, | |
| "endpoint 1": 9, | |
| "endpoint 2": 24, | |
| "exported_container 1": 12, | |
| "exported_container 2": 25, | |
| "exported_namespace 1": 10, | |
| "exported_namespace 2": 26, | |
| "exported_pod 1": 11, | |
| "exported_pod 2": 27, | |
| "gpu 1": 8, | |
| "gpu 2": 28, | |
| "instance": 0, | |
| "job 1": 13, | |
| "job 2": 29, | |
| "modelName 1": 2, | |
| "modelName 2": 30, | |
| "namespace 1": 14, | |
| "namespace 2": 31, | |
| "pod 1": 15, | |
| "pod 2": 32, | |
| "service 1": 16, | |
| "service 2": 33 | |
| }, | |
| "renameByName": { | |
| "Value #A": "GPU Utilization", | |
| "container 1": "", | |
| "exported_container 1": "Container", | |
| "exported_namespace 1": "Namespace", | |
| "exported_pod 1": "Pod", | |
| "gpu 1": "" | |
| } | |
| } | |
| } | |
| ], | |
| "type": "table" | |
| }, | |
| { | |
| "datasource": { | |
| "type": "prometheus", | |
| "uid": "$datasource" | |
| }, | |
| "fieldConfig": { | |
| "defaults": { | |
| "color": { | |
| "mode": "thresholds" | |
| }, | |
| "mappings": [], | |
| "max": 100, | |
| "min": 0, | |
| "thresholds": { | |
| "mode": "absolute", | |
| "steps": [ | |
| { | |
| "color": "green", | |
| "value": null | |
| }, | |
| { | |
| "color": "#EAB839", | |
| "value": 83 | |
| }, | |
| { | |
| "color": "red", | |
| "value": 87 | |
| } | |
| ] | |
| }, | |
| "unit": "celsius" | |
| }, | |
| "overrides": [] | |
| }, | |
| "gridPos": { | |
| "h": 8, | |
| "w": 6, | |
| "x": 18, | |
| "y": 0 | |
| }, | |
| "id": 14, | |
| "options": { | |
| "orientation": "auto", | |
| "reduceOptions": { | |
| "calcs": [ | |
| "mean" | |
| ], | |
| "fields": "", | |
| "values": false | |
| }, | |
| "showThresholdLabels": false, | |
| "showThresholdMarkers": true | |
| }, | |
| "pluginVersion": "9.1.6", | |
| "targets": [ | |
| { | |
| "datasource": { | |
| "uid": "$datasource" | |
| }, | |
| "editorMode": "code", | |
| "expr": "avg(DCGM_FI_DEV_GPU_TEMP{instance=~\"$instance\", gpu=~\"$gpu\"})", | |
| "interval": "", | |
| "legendFormat": "", | |
| "range": true, | |
| "refId": "A" | |
| } | |
| ], | |
| "title": "GPU Avg. Temp", | |
| "type": "gauge" | |
| }, | |
| { | |
| "datasource": { | |
| "type": "prometheus", | |
| "uid": "$datasource" | |
| }, | |
| "fieldConfig": { | |
| "defaults": { | |
| "color": { | |
| "mode": "thresholds" | |
| }, | |
| "custom": { | |
| "align": "auto", | |
| "displayMode": "auto", | |
| "inspect": false | |
| }, | |
| "mappings": [], | |
| "thresholds": { | |
| "mode": "absolute", | |
| "steps": [ | |
| { | |
| "color": "green", | |
| "value": null | |
| }, | |
| { | |
| "color": "red", | |
| "value": 80 | |
| } | |
| ] | |
| } | |
| }, | |
| "overrides": [ | |
| { | |
| "matcher": { | |
| "id": "byName", | |
| "options": "{Hostname=\"worker1.cloud9c.xtoph156.dfw.ocp.run\", UUID=\"GPU-0d986ca6-0fdf-cd1a-7022-e89ad0d55f61\", __name__=\"DCGM_FI_DEV_GPU_UTIL\", container=\"nvidia-dcgm-exporter\", device=\"nvidia0\", endpoint=\"gpu-metrics\", exported_container=\"llama2\", exported_namespace=\"finetune\", exported_pod=\"llama2-0\", gpu=\"0\", instance=\"192.168.156.136:9400\", job=\"nvidia-dcgm-exporter\", modelName=\"Tesla V100-PCIE-16GB\", namespace=\"nvidia-gpu-operator\", pod=\"nvidia-dcgm-exporter-2s2p8\", service=\"nvidia-dcgm-exporter\"}" | |
| }, | |
| "properties": [ | |
| { | |
| "id": "custom.width", | |
| "value": 299 | |
| } | |
| ] | |
| }, | |
| { | |
| "matcher": { | |
| "id": "byName", | |
| "options": "__name__ 2" | |
| }, | |
| "properties": [ | |
| { | |
| "id": "custom.width", | |
| "value": 231 | |
| } | |
| ] | |
| }, | |
| { | |
| "matcher": { | |
| "id": "byName", | |
| "options": "container 1" | |
| }, | |
| "properties": [ | |
| { | |
| "id": "custom.width", | |
| "value": 226 | |
| } | |
| ] | |
| }, | |
| { | |
| "matcher": { | |
| "id": "byName", | |
| "options": "job 1" | |
| }, | |
| "properties": [ | |
| { | |
| "id": "custom.width", | |
| "value": 208 | |
| } | |
| ] | |
| }, | |
| { | |
| "matcher": { | |
| "id": "byName", | |
| "options": "modelName" | |
| }, | |
| "properties": [ | |
| { | |
| "id": "custom.width", | |
| "value": 164 | |
| } | |
| ] | |
| }, | |
| { | |
| "matcher": { | |
| "id": "byName", | |
| "options": "exported_namespace" | |
| }, | |
| "properties": [ | |
| { | |
| "id": "custom.width", | |
| "value": 151 | |
| } | |
| ] | |
| }, | |
| { | |
| "matcher": { | |
| "id": "byName", | |
| "options": "instance" | |
| }, | |
| "properties": [ | |
| { | |
| "id": "custom.width", | |
| "value": 112 | |
| } | |
| ] | |
| }, | |
| { | |
| "matcher": { | |
| "id": "byName", | |
| "options": "Hostname" | |
| }, | |
| "properties": [ | |
| { | |
| "id": "custom.width", | |
| "value": 200 | |
| } | |
| ] | |
| }, | |
| { | |
| "matcher": { | |
| "id": "byName", | |
| "options": "gpu" | |
| }, | |
| "properties": [ | |
| { | |
| "id": "custom.width", | |
| "value": 72 | |
| } | |
| ] | |
| }, | |
| { | |
| "matcher": { | |
| "id": "byName", | |
| "options": "GPU Util" | |
| }, | |
| "properties": [ | |
| { | |
| "id": "custom.width", | |
| "value": 57 | |
| } | |
| ] | |
| }, | |
| { | |
| "matcher": { | |
| "id": "byName", | |
| "options": "device" | |
| }, | |
| "properties": [ | |
| { | |
| "id": "custom.width", | |
| "value": 101 | |
| } | |
| ] | |
| } | |
| ] | |
| }, | |
| "gridPos": { | |
| "h": 8, | |
| "w": 18, | |
| "x": 0, | |
| "y": 8 | |
| }, | |
| "id": 22, | |
| "options": { | |
| "footer": { | |
| "fields": "", | |
| "reducer": [ | |
| "sum" | |
| ], | |
| "show": false | |
| }, | |
| "showHeader": true, | |
| "sortBy": [] | |
| }, | |
| "pluginVersion": "9.1.6", | |
| "targets": [ | |
| { | |
| "datasource": { | |
| "type": "prometheus", | |
| "uid": "$datasource" | |
| }, | |
| "editorMode": "code", | |
| "exemplar": false, | |
| "expr": "DCGM_FI_DEV_GPU_UTIL{instance=~\"$instance\"}", | |
| "format": "table", | |
| "instant": false, | |
| "interval": "", | |
| "legendFormat": "__auto", | |
| "range": true, | |
| "refId": "A" | |
| }, | |
| { | |
| "datasource": { | |
| "type": "prometheus", | |
| "uid": "$datasource" | |
| }, | |
| "editorMode": "code", | |
| "expr": "DCGM_FI_DEV_COUNT{instance=~\"$instance\"}", | |
| "format": "table", | |
| "hide": false, | |
| "legendFormat": "__auto", | |
| "range": true, | |
| "refId": "B" | |
| }, | |
| { | |
| "datasource": { | |
| "type": "prometheus", | |
| "uid": "$datasource" | |
| }, | |
| "editorMode": "code", | |
| "expr": "DCGM_FI_DEV_GPU_TEMP{instance=~\"$instance\"}", | |
| "format": "table", | |
| "hide": false, | |
| "legendFormat": "__auto", | |
| "range": true, | |
| "refId": "C" | |
| } | |
| ], | |
| "title": "Processes/ Node", | |
| "transformations": [ | |
| { | |
| "id": "seriesToColumns", | |
| "options": { | |
| "byField": "instance" | |
| } | |
| }, | |
| { | |
| "id": "organize", | |
| "options": { | |
| "excludeByName": { | |
| "Hostname 2": true, | |
| "Hostname 3": true, | |
| "Time 1": true, | |
| "Time 2": true, | |
| "Time 3": true, | |
| "UUID 1": true, | |
| "UUID 2": true, | |
| "UUID 3": true, | |
| "Value #A": false, | |
| "__name__ 1": true, | |
| "__name__ 2": true, | |
| "__name__ 3": true, | |
| "container 1": true, | |
| "container 2": true, | |
| "container 3": true, | |
| "device 2": true, | |
| "device 3": true, | |
| "endpoint 1": true, | |
| "endpoint 2": true, | |
| "endpoint 3": true, | |
| "exported_container 2": true, | |
| "exported_container 3": true, | |
| "exported_namespace 2": true, | |
| "exported_namespace 3": true, | |
| "exported_pod 1": false, | |
| "exported_pod 2": true, | |
| "exported_pod 3": true, | |
| "gpu 1": false, | |
| "gpu 2": true, | |
| "gpu 3": true, | |
| "job 1": true, | |
| "job 2": true, | |
| "job 3": true, | |
| "modelName 2": true, | |
| "modelName 3": true, | |
| "namespace 1": true, | |
| "namespace 2": true, | |
| "namespace 3": true, | |
| "pod 1": true, | |
| "pod 2": true, | |
| "pod 3": true, | |
| "service 1": true, | |
| "service 2": true, | |
| "service 3": true | |
| }, | |
| "indexByName": { | |
| "Hostname 1": 2, | |
| "Hostname 2": 19, | |
| "Time 1": 1, | |
| "Time 2": 18, | |
| "UUID 1": 3, | |
| "UUID 2": 20, | |
| "Value #A": 17, | |
| "Value #B": 34, | |
| "__name__ 1": 4, | |
| "__name__ 2": 21, | |
| "container 1": 5, | |
| "container 2": 22, | |
| "device 1": 6, | |
| "device 2": 23, | |
| "endpoint 1": 7, | |
| "endpoint 2": 24, | |
| "exported_container 1": 8, | |
| "exported_container 2": 25, | |
| "exported_namespace 1": 9, | |
| "exported_namespace 2": 26, | |
| "exported_pod 1": 11, | |
| "exported_pod 2": 27, | |
| "gpu 1": 12, | |
| "gpu 2": 28, | |
| "instance": 0, | |
| "job 1": 13, | |
| "job 2": 29, | |
| "modelName 1": 10, | |
| "modelName 2": 30, | |
| "namespace 1": 14, | |
| "namespace 2": 31, | |
| "pod 1": 15, | |
| "pod 2": 32, | |
| "service 1": 16, | |
| "service 2": 33 | |
| }, | |
| "renameByName": { | |
| "Value #A": "GPU Util", | |
| "Value #B": "GPU Count", | |
| "Value #C": "Temperature" | |
| } | |
| } | |
| } | |
| ], | |
| "type": "table" | |
| }, | |
| { | |
| "datasource": { | |
| "type": "prometheus", | |
| "uid": "$datasource" | |
| }, | |
| "fieldConfig": { | |
| "defaults": { | |
| "color": { | |
| "mode": "thresholds" | |
| }, | |
| "mappings": [], | |
| "thresholds": { | |
| "mode": "absolute", | |
| "steps": [ | |
| { | |
| "color": "green", | |
| "value": null | |
| }, | |
| { | |
| "color": "red", | |
| "value": 80 | |
| } | |
| ] | |
| } | |
| }, | |
| "overrides": [] | |
| }, | |
| "gridPos": { | |
| "h": 8, | |
| "w": 6, | |
| "x": 18, | |
| "y": 8 | |
| }, | |
| "id": 20, | |
| "options": { | |
| "colorMode": "none", | |
| "graphMode": "none", | |
| "justifyMode": "auto", | |
| "orientation": "horizontal", | |
| "reduceOptions": { | |
| "calcs": [ | |
| "mean" | |
| ], | |
| "fields": "", | |
| "values": false | |
| }, | |
| "textMode": "auto" | |
| }, | |
| "pluginVersion": "9.1.6", | |
| "targets": [ | |
| { | |
| "datasource": { | |
| "uid": "$datasource" | |
| }, | |
| "editorMode": "code", | |
| "expr": "count(DCGM_FI_DEV_GPU_UTIL{instance=~\"$instance\"})", | |
| "format": "time_series", | |
| "range": true, | |
| "refId": "A" | |
| } | |
| ], | |
| "title": "Total Number of GPUs", | |
| "type": "stat" | |
| }, | |
| { | |
| "datasource": { | |
| "type": "prometheus", | |
| "uid": "$datasource" | |
| }, | |
| "fieldConfig": { | |
| "defaults": { | |
| "color": { | |
| "mode": "thresholds" | |
| }, | |
| "custom": { | |
| "align": "auto", | |
| "displayMode": "auto", | |
| "inspect": false | |
| }, | |
| "mappings": [], | |
| "thresholds": { | |
| "mode": "absolute", | |
| "steps": [ | |
| { | |
| "color": "green", | |
| "value": null | |
| }, | |
| { | |
| "color": "red", | |
| "value": 80 | |
| } | |
| ] | |
| } | |
| }, | |
| "overrides": [ | |
| { | |
| "matcher": { | |
| "id": "byName", | |
| "options": "__name__" | |
| }, | |
| "properties": [ | |
| { | |
| "id": "custom.width", | |
| "value": 280 | |
| } | |
| ] | |
| }, | |
| { | |
| "matcher": { | |
| "id": "byName", | |
| "options": "modelName" | |
| }, | |
| "properties": [ | |
| { | |
| "id": "custom.width", | |
| "value": 187 | |
| } | |
| ] | |
| }, | |
| { | |
| "matcher": { | |
| "id": "byName", | |
| "options": "instance" | |
| }, | |
| "properties": [ | |
| { | |
| "id": "custom.width", | |
| "value": 172 | |
| } | |
| ] | |
| }, | |
| { | |
| "matcher": { | |
| "id": "byName", | |
| "options": "Hostname" | |
| }, | |
| "properties": [ | |
| { | |
| "id": "custom.width", | |
| "value": 293 | |
| } | |
| ] | |
| }, | |
| { | |
| "matcher": { | |
| "id": "byName", | |
| "options": "device" | |
| }, | |
| "properties": [ | |
| { | |
| "id": "custom.width", | |
| "value": 104 | |
| } | |
| ] | |
| } | |
| ] | |
| }, | |
| "gridPos": { | |
| "h": 8, | |
| "w": 18, | |
| "x": 0, | |
| "y": 16 | |
| }, | |
| "id": 24, | |
| "options": { | |
| "footer": { | |
| "fields": "", | |
| "reducer": [ | |
| "sum" | |
| ], | |
| "show": false | |
| }, | |
| "showHeader": true, | |
| "sortBy": [] | |
| }, | |
| "pluginVersion": "9.1.6", | |
| "targets": [ | |
| { | |
| "datasource": { | |
| "type": "prometheus", | |
| "uid": "$datasource" | |
| }, | |
| "editorMode": "code", | |
| "expr": "count by (instance) (DCGM_FI_DEV_GPU_UTIL{instance=~\"$instance\"})", | |
| "format": "table", | |
| "legendFormat": "__auto", | |
| "range": true, | |
| "refId": "A" | |
| }, | |
| { | |
| "datasource": { | |
| "type": "prometheus", | |
| "uid": "$datasource" | |
| }, | |
| "editorMode": "code", | |
| "expr": "DCGM_FI_DEV_GPU_UTIL{instance=~\"$instance\"}", | |
| "format": "table", | |
| "hide": false, | |
| "legendFormat": "__auto", | |
| "range": true, | |
| "refId": "B" | |
| } | |
| ], | |
| "title": "Number of GPUs/ Node", | |
| "transformations": [ | |
| { | |
| "id": "seriesToColumns", | |
| "options": { | |
| "byField": "instance" | |
| } | |
| }, | |
| { | |
| "id": "organize", | |
| "options": { | |
| "excludeByName": { | |
| "Time": true, | |
| "Time 1": true, | |
| "Time 2": true, | |
| "UUID": true, | |
| "Value #B": true, | |
| "__name__": true, | |
| "container": true, | |
| "device": true, | |
| "endpoint": true, | |
| "exported_container": true, | |
| "exported_namespace": true, | |
| "exported_pod": true, | |
| "gpu": true, | |
| "job": true, | |
| "namespace": true, | |
| "pod": true, | |
| "service": true | |
| }, | |
| "indexByName": { | |
| "Hostname": 0, | |
| "Time 1": 3, | |
| "Time 2": 4, | |
| "UUID": 2, | |
| "Value #A": 19, | |
| "Value #B": 18, | |
| "__name__": 5, | |
| "container": 6, | |
| "device": 7, | |
| "endpoint": 9, | |
| "exported_container": 10, | |
| "exported_namespace": 11, | |
| "exported_pod": 12, | |
| "gpu": 13, | |
| "instance": 1, | |
| "job": 14, | |
| "modelName": 8, | |
| "namespace": 15, | |
| "pod": 16, | |
| "service": 17 | |
| }, | |
| "renameByName": { | |
| "Value #A": "Total GPUs", | |
| "instance": "" | |
| } | |
| } | |
| } | |
| ], | |
| "type": "table" | |
| }, | |
| { | |
| "datasource": { | |
| "uid": "$datasource" | |
| }, | |
| "fieldConfig": { | |
| "defaults": { | |
| "color": { | |
| "mode": "thresholds" | |
| }, | |
| "mappings": [], | |
| "max": 2400, | |
| "min": 0, | |
| "thresholds": { | |
| "mode": "absolute", | |
| "steps": [ | |
| { | |
| "color": "green", | |
| "value": null | |
| }, | |
| { | |
| "color": "#EAB839", | |
| "value": 1800 | |
| }, | |
| { | |
| "color": "red", | |
| "value": 2200 | |
| } | |
| ] | |
| }, | |
| "unit": "watt" | |
| }, | |
| "overrides": [] | |
| }, | |
| "gridPos": { | |
| "h": 8, | |
| "w": 6, | |
| "x": 18, | |
| "y": 16 | |
| }, | |
| "id": 16, | |
| "links": [], | |
| "options": { | |
| "orientation": "horizontal", | |
| "reduceOptions": { | |
| "calcs": [ | |
| "sum" | |
| ], | |
| "fields": "", | |
| "values": false | |
| }, | |
| "showThresholdLabels": false, | |
| "showThresholdMarkers": true | |
| }, | |
| "pluginVersion": "9.1.6", | |
| "targets": [ | |
| { | |
| "datasource": { | |
| "uid": "$datasource" | |
| }, | |
| "expr": "sum(DCGM_FI_DEV_POWER_USAGE{instance=~\"$instance\", gpu=~\"$gpu\"})", | |
| "instant": true, | |
| "interval": "", | |
| "legendFormat": "", | |
| "range": false, | |
| "refId": "A" | |
| } | |
| ], | |
| "title": "GPU Power Total", | |
| "type": "gauge" | |
| }, | |
| { | |
| "aliasColors": {}, | |
| "bars": false, | |
| "dashLength": 10, | |
| "dashes": false, | |
| "datasource": { | |
| "uid": "$datasource" | |
| }, | |
| "fieldConfig": { | |
| "defaults": { | |
| "links": [] | |
| }, | |
| "overrides": [] | |
| }, | |
| "fill": 1, | |
| "fillGradient": 0, | |
| "gridPos": { | |
| "h": 8, | |
| "w": 11, | |
| "x": 0, | |
| "y": 24 | |
| }, | |
| "hiddenSeries": false, | |
| "id": 12, | |
| "legend": { | |
| "alignAsTable": true, | |
| "avg": true, | |
| "current": true, | |
| "max": true, | |
| "min": false, | |
| "rightSide": true, | |
| "show": true, | |
| "total": false, | |
| "values": true | |
| }, | |
| "lines": true, | |
| "linewidth": 2, | |
| "nullPointMode": "null", | |
| "options": { | |
| "alertThreshold": true | |
| }, | |
| "percentage": false, | |
| "pluginVersion": "9.1.6", | |
| "pointradius": 2, | |
| "points": false, | |
| "renderer": "flot", | |
| "seriesOverrides": [], | |
| "spaceLength": 10, | |
| "stack": false, | |
| "steppedLine": false, | |
| "targets": [ | |
| { | |
| "datasource": { | |
| "uid": "$datasource" | |
| }, | |
| "expr": "DCGM_FI_DEV_GPU_TEMP{instance=~\"$instance\", gpu=~\"$gpu\"}", | |
| "instant": false, | |
| "interval": "", | |
| "legendFormat": "GPU {{gpu}}", | |
| "refId": "A" | |
| } | |
| ], | |
| "thresholds": [], | |
| "timeRegions": [], | |
| "title": "GPU Temperature", | |
| "tooltip": { | |
| "shared": true, | |
| "sort": 0, | |
| "value_type": "individual" | |
| }, | |
| "type": "graph", | |
| "xaxis": { | |
| "mode": "time", | |
| "show": true, | |
| "values": [] | |
| }, | |
| "yaxes": [ | |
| { | |
| "format": "celsius", | |
| "logBase": 1, | |
| "show": true | |
| }, | |
| { | |
| "format": "short", | |
| "logBase": 1, | |
| "show": true | |
| } | |
| ], | |
| "yaxis": { | |
| "align": false | |
| } | |
| }, | |
| { | |
| "aliasColors": {}, | |
| "bars": false, | |
| "dashLength": 10, | |
| "dashes": false, | |
| "datasource": { | |
| "uid": "$datasource" | |
| }, | |
| "fieldConfig": { | |
| "defaults": { | |
| "links": [] | |
| }, | |
| "overrides": [] | |
| }, | |
| "fill": 1, | |
| "fillGradient": 0, | |
| "gridPos": { | |
| "h": 8, | |
| "w": 13, | |
| "x": 11, | |
| "y": 24 | |
| }, | |
| "hiddenSeries": false, | |
| "id": 10, | |
| "legend": { | |
| "alignAsTable": true, | |
| "avg": true, | |
| "current": true, | |
| "max": true, | |
| "min": false, | |
| "rightSide": true, | |
| "show": true, | |
| "total": false, | |
| "values": true | |
| }, | |
| "lines": true, | |
| "linewidth": 2, | |
| "nullPointMode": "null", | |
| "options": { | |
| "alertThreshold": true | |
| }, | |
| "percentage": false, | |
| "pluginVersion": "9.1.6", | |
| "pointradius": 2, | |
| "points": false, | |
| "renderer": "flot", | |
| "seriesOverrides": [], | |
| "spaceLength": 10, | |
| "stack": false, | |
| "steppedLine": false, | |
| "targets": [ | |
| { | |
| "datasource": { | |
| "uid": "$datasource" | |
| }, | |
| "expr": "DCGM_FI_DEV_POWER_USAGE{instance=~\"$instance\", gpu=~\"$gpu\"}", | |
| "interval": "", | |
| "legendFormat": "GPU {{gpu}}", | |
| "refId": "A" | |
| } | |
| ], | |
| "thresholds": [], | |
| "timeRegions": [], | |
| "title": "GPU Power Usage", | |
| "tooltip": { | |
| "shared": true, | |
| "sort": 0, | |
| "value_type": "individual" | |
| }, | |
| "type": "graph", | |
| "xaxis": { | |
| "mode": "time", | |
| "show": true, | |
| "values": [] | |
| }, | |
| "yaxes": [ | |
| { | |
| "format": "watt", | |
| "logBase": 1, | |
| "show": true | |
| }, | |
| { | |
| "format": "short", | |
| "logBase": 1, | |
| "show": true | |
| } | |
| ], | |
| "yaxis": { | |
| "align": false | |
| } | |
| }, | |
| { | |
| "aliasColors": {}, | |
| "bars": false, | |
| "dashLength": 10, | |
| "dashes": false, | |
| "datasource": { | |
| "uid": "$datasource" | |
| }, | |
| "fieldConfig": { | |
| "defaults": { | |
| "links": [] | |
| }, | |
| "overrides": [] | |
| }, | |
| "fill": 1, | |
| "fillGradient": 0, | |
| "gridPos": { | |
| "h": 8, | |
| "w": 11, | |
| "x": 0, | |
| "y": 32 | |
| }, | |
| "hiddenSeries": false, | |
| "id": 2, | |
| "interval": "", | |
| "legend": { | |
| "alignAsTable": true, | |
| "avg": true, | |
| "current": true, | |
| "max": true, | |
| "min": false, | |
| "rightSide": true, | |
| "show": true, | |
| "total": false, | |
| "values": true | |
| }, | |
| "lines": true, | |
| "linewidth": 2, | |
| "nullPointMode": "null", | |
| "options": { | |
| "alertThreshold": true | |
| }, | |
| "percentage": false, | |
| "pluginVersion": "9.1.6", | |
| "pointradius": 2, | |
| "points": false, | |
| "renderer": "flot", | |
| "seriesOverrides": [], | |
| "spaceLength": 10, | |
| "stack": false, | |
| "steppedLine": false, | |
| "targets": [ | |
| { | |
| "datasource": { | |
| "uid": "$datasource" | |
| }, | |
| "expr": "DCGM_FI_DEV_SM_CLOCK{instance=~\"$instance\", gpu=~\"$gpu\"} * 1000000", | |
| "format": "time_series", | |
| "interval": "", | |
| "intervalFactor": 1, | |
| "legendFormat": "GPU {{gpu}}", | |
| "refId": "A" | |
| } | |
| ], | |
| "thresholds": [], | |
| "timeRegions": [], | |
| "title": "GPU SM Clocks", | |
| "tooltip": { | |
| "shared": true, | |
| "sort": 0, | |
| "value_type": "individual" | |
| }, | |
| "type": "graph", | |
| "xaxis": { | |
| "mode": "time", | |
| "show": true, | |
| "values": [] | |
| }, | |
| "yaxes": [ | |
| { | |
| "format": "hertz", | |
| "label": "", | |
| "logBase": 1, | |
| "show": true | |
| }, | |
| { | |
| "format": "short", | |
| "logBase": 1, | |
| "show": true | |
| } | |
| ], | |
| "yaxis": { | |
| "align": false | |
| } | |
| }, | |
| { | |
| "aliasColors": {}, | |
| "bars": false, | |
| "dashLength": 10, | |
| "dashes": false, | |
| "datasource": { | |
| "type": "prometheus", | |
| "uid": "$datasource" | |
| }, | |
| "fieldConfig": { | |
| "defaults": { | |
| "links": [] | |
| }, | |
| "overrides": [] | |
| }, | |
| "fill": 1, | |
| "fillGradient": 0, | |
| "gridPos": { | |
| "h": 8, | |
| "w": 13, | |
| "x": 11, | |
| "y": 32 | |
| }, | |
| "hiddenSeries": false, | |
| "id": 6, | |
| "legend": { | |
| "alignAsTable": true, | |
| "avg": true, | |
| "current": true, | |
| "max": true, | |
| "min": false, | |
| "rightSide": true, | |
| "show": true, | |
| "total": false, | |
| "values": true | |
| }, | |
| "lines": true, | |
| "linewidth": 2, | |
| "nullPointMode": "null", | |
| "options": { | |
| "alertThreshold": true | |
| }, | |
| "percentage": false, | |
| "pluginVersion": "9.1.6", | |
| "pointradius": 2, | |
| "points": false, | |
| "renderer": "flot", | |
| "seriesOverrides": [], | |
| "spaceLength": 10, | |
| "stack": false, | |
| "steppedLine": false, | |
| "targets": [ | |
| { | |
| "datasource": { | |
| "uid": "$datasource" | |
| }, | |
| "editorMode": "code", | |
| "expr": "DCGM_FI_DEV_GPU_UTIL{instance=~\"$instance\", gpu=~\"$gpu\"}", | |
| "interval": "", | |
| "legendFormat": "GPU {{gpu}}", | |
| "range": true, | |
| "refId": "A" | |
| } | |
| ], | |
| "thresholds": [], | |
| "timeRegions": [], | |
| "title": "GPU Utilization", | |
| "tooltip": { | |
| "shared": true, | |
| "sort": 0, | |
| "value_type": "cumulative" | |
| }, | |
| "type": "graph", | |
| "xaxis": { | |
| "mode": "time", | |
| "show": true, | |
| "values": [] | |
| }, | |
| "yaxes": [ | |
| { | |
| "format": "percent", | |
| "logBase": 1, | |
| "max": "100", | |
| "min": "0", | |
| "show": true | |
| }, | |
| { | |
| "format": "short", | |
| "logBase": 1, | |
| "show": true | |
| } | |
| ], | |
| "yaxis": { | |
| "align": false | |
| } | |
| }, | |
| { | |
| "aliasColors": {}, | |
| "bars": false, | |
| "dashLength": 10, | |
| "dashes": false, | |
| "datasource": { | |
| "type": "prometheus", | |
| "uid": "$datasource" | |
| }, | |
| "fieldConfig": { | |
| "defaults": { | |
| "links": [] | |
| }, | |
| "overrides": [] | |
| }, | |
| "fill": 1, | |
| "fillGradient": 0, | |
| "gridPos": { | |
| "h": 8, | |
| "w": 11, | |
| "x": 0, | |
| "y": 40 | |
| }, | |
| "hiddenSeries": false, | |
| "id": 4, | |
| "legend": { | |
| "alignAsTable": true, | |
| "avg": true, | |
| "current": true, | |
| "max": true, | |
| "min": false, | |
| "rightSide": true, | |
| "show": true, | |
| "total": false, | |
| "values": true | |
| }, | |
| "lines": true, | |
| "linewidth": 2, | |
| "nullPointMode": "null", | |
| "options": { | |
| "alertThreshold": true | |
| }, | |
| "percentage": false, | |
| "pluginVersion": "9.1.6", | |
| "pointradius": 2, | |
| "points": false, | |
| "renderer": "flot", | |
| "seriesOverrides": [], | |
| "spaceLength": 10, | |
| "stack": false, | |
| "steppedLine": false, | |
| "targets": [ | |
| { | |
| "datasource": { | |
| "uid": "$datasource" | |
| }, | |
| "editorMode": "code", | |
| "expr": "DCGM_FI_PROF_PIPE_TENSOR_ACTIVE{instance=~\"$instance\", gpu=~\"$gpu\"}", | |
| "interval": "", | |
| "legendFormat": "GPU {{gpu}}", | |
| "range": true, | |
| "refId": "A" | |
| } | |
| ], | |
| "thresholds": [], | |
| "timeRegions": [], | |
| "title": "Tensor Core Utilization", | |
| "tooltip": { | |
| "shared": true, | |
| "sort": 0, | |
| "value_type": "cumulative" | |
| }, | |
| "type": "graph", | |
| "xaxis": { | |
| "mode": "time", | |
| "show": true, | |
| "values": [] | |
| }, | |
| "yaxes": [ | |
| { | |
| "format": "percentunit", | |
| "logBase": 1, | |
| "max": "1", | |
| "min": "0", | |
| "show": true | |
| }, | |
| { | |
| "format": "short", | |
| "logBase": 1, | |
| "show": true | |
| } | |
| ], | |
| "yaxis": { | |
| "align": false | |
| } | |
| }, | |
| { | |
| "aliasColors": {}, | |
| "bars": false, | |
| "dashLength": 10, | |
| "dashes": false, | |
| "datasource": { | |
| "type": "prometheus", | |
| "uid": "$datasource" | |
| }, | |
| "fieldConfig": { | |
| "defaults": { | |
| "links": [] | |
| }, | |
| "overrides": [] | |
| }, | |
| "fill": 1, | |
| "fillGradient": 0, | |
| "gridPos": { | |
| "h": 8, | |
| "w": 13, | |
| "x": 11, | |
| "y": 40 | |
| }, | |
| "hiddenSeries": false, | |
| "id": 18, | |
| "legend": { | |
| "alignAsTable": true, | |
| "avg": true, | |
| "current": true, | |
| "max": true, | |
| "min": false, | |
| "rightSide": true, | |
| "show": true, | |
| "total": false, | |
| "values": true | |
| }, | |
| "lines": true, | |
| "linewidth": 2, | |
| "nullPointMode": "null", | |
| "options": { | |
| "alertThreshold": true | |
| }, | |
| "percentage": false, | |
| "pluginVersion": "9.1.6", | |
| "pointradius": 2, | |
| "points": false, | |
| "renderer": "flot", | |
| "seriesOverrides": [], | |
| "spaceLength": 10, | |
| "stack": false, | |
| "steppedLine": false, | |
| "targets": [ | |
| { | |
| "datasource": { | |
| "uid": "$datasource" | |
| }, | |
| "editorMode": "code", | |
| "expr": "DCGM_FI_DEV_FB_USED{instance=~\"$instance\", gpu=~\"$gpu\"}", | |
| "interval": "", | |
| "legendFormat": "GPU {{gpu}}", | |
| "range": true, | |
| "refId": "A" | |
| } | |
| ], | |
| "thresholds": [], | |
| "timeRegions": [], | |
| "title": "GPU Framebuffer Mem Used", | |
| "tooltip": { | |
| "shared": true, | |
| "sort": 0, | |
| "value_type": "individual" | |
| }, | |
| "type": "graph", | |
| "xaxis": { | |
| "mode": "time", | |
| "show": true, | |
| "values": [] | |
| }, | |
| "yaxes": [ | |
| { | |
| "format": "decmbytes", | |
| "logBase": 1, | |
| "show": true | |
| }, | |
| { | |
| "format": "short", | |
| "logBase": 1, | |
| "show": true | |
| } | |
| ], | |
| "yaxis": { | |
| "align": false | |
| } | |
| } | |
| ], | |
| "refresh": false, | |
| "schemaVersion": 37, | |
| "style": "dark", | |
| "tags": [], | |
| "templating": { | |
| "list": [ | |
| { | |
| "current": { | |
| "selected": false, | |
| "text": "Prometheus", | |
| "value": "Prometheus" | |
| }, | |
| "hide": 0, | |
| "includeAll": false, | |
| "multi": false, | |
| "name": "datasource", | |
| "options": [], | |
| "query": "prometheus", | |
| "queryValue": "", | |
| "refresh": 1, | |
| "regex": "", | |
| "skipUrlSync": false, | |
| "type": "datasource" | |
| }, | |
| { | |
| "current": { | |
| "selected": false, | |
| "text": "All", | |
| "value": "$__all" | |
| }, | |
| "datasource": { | |
| "type": "prometheus", | |
| "uid": "$datasource" | |
| }, | |
| "definition": "label_values(DCGM_FI_DEV_GPU_TEMP, instance)", | |
| "hide": 0, | |
| "includeAll": true, | |
| "multi": true, | |
| "name": "instance", | |
| "options": [], | |
| "query": { | |
| "query": "label_values(DCGM_FI_DEV_GPU_TEMP, instance)", | |
| "refId": "Prometheus-instance-Variable-Query" | |
| }, | |
| "refresh": 1, | |
| "regex": "", | |
| "skipUrlSync": false, | |
| "sort": 1, | |
| "tagValuesQuery": "", | |
| "tagsQuery": "", | |
| "type": "query", | |
| "useTags": false | |
| }, | |
| { | |
| "current": { | |
| "selected": false, | |
| "text": "All", | |
| "value": "$__all" | |
| }, | |
| "datasource": { | |
| "type": "prometheus", | |
| "uid": "$datasource" | |
| }, | |
| "definition": "label_values(DCGM_FI_DEV_GPU_TEMP, gpu)", | |
| "hide": 0, | |
| "includeAll": true, | |
| "multi": true, | |
| "name": "gpu", | |
| "options": [], | |
| "query": { | |
| "query": "label_values(DCGM_FI_DEV_GPU_TEMP, gpu)", | |
| "refId": "Prometheus-gpu-Variable-Query" | |
| }, | |
| "refresh": 1, | |
| "regex": "", | |
| "skipUrlSync": false, | |
| "sort": 1, | |
| "tagValuesQuery": "", | |
| "tagsQuery": "", | |
| "type": "query", | |
| "useTags": false | |
| } | |
| ] | |
| }, | |
| "time": { | |
| "from": "now-15m", | |
| "to": "now" | |
| }, | |
| "timepicker": { | |
| "refresh_intervals": [ | |
| "5s", | |
| "10s", | |
| "30s", | |
| "1m", | |
| "5m", | |
| "15m", | |
| "30m", | |
| "1h", | |
| "2h", | |
| "1d" | |
| ] | |
| }, | |
| "timezone": "", | |
| "title": "NVIDIA DCGM Exporter Dashboard V2", | |
| "uid": "Oxed_c6Wz", | |
| "version": 14, | |
| "weekStart": "" | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment