Created
January 23, 2026 04:10
-
-
Save christopherpaquin/3c500b462d72cb04caf5f0dabda8e5cf to your computer and use it in GitHub Desktop.
Template-NVIDIA-nvidia-smi.xml
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| <?xml version="1.0" encoding="UTF-8"?> | |
| <zabbix_export> | |
| <version>7.4</version> | |
| <!-- Declares the template group used by this export; the template's own <groups> element refers to the same group name. --> | |
| <template_groups> | |
| <template_group> | |
| <name>Templates/Hardware</name> | |
| </template_group> | |
| </template_groups> | |
| <templates> | |
| <template> | |
| <template>Template NVIDIA GPU (nvidia-smi UserParameter)</template> | |
| <name>Template NVIDIA GPU (nvidia-smi UserParameter)</name> | |
| <groups> | |
| <group> | |
| <name>Templates/Hardware</name> | |
| </group> | |
| </groups> | |
| <description>Monitors NVIDIA GPUs via Zabbix agent UserParameters calling nvidia-smi. | |
| Requires keys: | |
| - nvidia.discovery | |
| - nvidia.metric[INDEX,metric]</description> | |
| <!-- User macros: thresholds and time windows referenced by the trigger prototypes of this template. Override them per host to tune alerting without editing the triggers. --> | |
| <macros> | |
| <!-- Time window used by the "No data" trigger's nodata() check. --> | |
| <macro> | |
| <macro>{$GPU.NODATA}</macro> | |
| <value>10m</value> | |
| <description>Alert if GPU metrics stop arriving for this long.</description> | |
| </macro> | |
| <!-- Temperature thresholds: WARN feeds the warning trigger, HIGH feeds the high-severity trigger. --> | |
| <macro> | |
| <macro>{$GPU.TEMP.WARN}</macro> | |
| <value>80</value> | |
| <description>Temperature warning threshold (°C).</description> | |
| </macro> | |
| <macro> | |
| <macro>{$GPU.TEMP.HIGH}</macro> | |
| <value>85</value> | |
| <description>Temperature high/critical threshold (°C).</description> | |
| </macro> | |
| <!-- Fixed power threshold compared against the latest power draw reading. --> | |
| <macro> | |
| <macro>{$GPU.POWER.HIGH}</macro> | |
| <value>70</value> | |
| <description>Power draw high/critical threshold (W). Tesla T4 is typically 70W cap.</description> | |
| </macro> | |
| <!-- Threshold compared against the latest memory utilization reading. --> | |
| <macro> | |
| <macro>{$GPU.MEMUTIL.HIGH}</macro> | |
| <value>95</value> | |
| <description>Memory utilization high threshold (%).</description> | |
| </macro> | |
| <!-- The "stuck high" trigger fires when the minimum utilization over STUCK.DUR is at or above STUCK.HIGH. --> | |
| <macro> | |
| <macro>{$GPU.UTIL.STUCK.HIGH}</macro> | |
| <value>99</value> | |
| <description>GPU utilization 'stuck high' threshold (%).</description> | |
| </macro> | |
| <macro> | |
| <macro>{$GPU.UTIL.STUCK.DUR}</macro> | |
| <value>10m</value> | |
| <description>Duration for utilization 'stuck high' check.</description> | |
| </macro> | |
| </macros> | |
| <discovery_rules> | |
| <discovery_rule> | |
| <name>NVIDIA GPU discovery</name> | |
| <type>ZABBIX_ACTIVE</type> | |
| <key>nvidia.discovery</key> | |
| <delay>1h</delay> | |
| <lifetime>7d</lifetime> | |
| <description>Discovers GPUs using nvidia.discovery JSON.</description> | |
| <item_prototypes> | |
| <!-- Per-GPU utilization in percent: active-agent check of nvidia.metric[{#GPUINDEX},util] every 30s, stored as a float with 7d history and 365d trends. Also used by the "No data" and "stuck high" trigger prototypes. --> | |
| <item_prototype> | |
| <name>GPU {#GPUINDEX}: Utilization</name> | |
| <type>ZABBIX_ACTIVE</type> | |
| <key>nvidia.metric[{#GPUINDEX},util]</key> | |
| <delay>30s</delay> | |
| <history>7d</history> | |
| <trends>365d</trends> | |
| <value_type>FLOAT</value_type> | |
| <units>%</units> | |
| </item_prototype> | |
| <!-- Per-GPU temperature in °C: active-agent check of nvidia.metric[{#GPUINDEX},temp] every 30s, stored as a float with 7d history and 365d trends. Source item for both temperature trigger prototypes. --> | |
| <item_prototype> | |
| <name>GPU {#GPUINDEX}: Temperature</name> | |
| <type>ZABBIX_ACTIVE</type> | |
| <key>nvidia.metric[{#GPUINDEX},temp]</key> | |
| <delay>30s</delay> | |
| <history>7d</history> | |
| <trends>365d</trends> | |
| <value_type>FLOAT</value_type> | |
| <units>°C</units> | |
| </item_prototype> | |
| <!-- Per-GPU memory in use: active-agent check of nvidia.metric[{#GPUINDEX},mem.used] every 30s, stored as a float with 7d history and 365d trends. The value is labelled MiB; the leading "!" in the unit tells Zabbix not to apply its automatic K/M/G prefix conversion, which would otherwise render e.g. 15360 MiB as "15.36 KMiB". --> | |
| <item_prototype> | |
| <name>GPU {#GPUINDEX}: Memory used</name> | |
| <type>ZABBIX_ACTIVE</type> | |
| <key>nvidia.metric[{#GPUINDEX},mem.used]</key> | |
| <delay>30s</delay> | |
| <history>7d</history> | |
| <trends>365d</trends> | |
| <value_type>FLOAT</value_type> | |
| <units>!MiB</units> | |
| </item_prototype> | |
| <!-- Per-GPU total memory: active-agent check of nvidia.metric[{#GPUINDEX},mem.total], polled only once per hour, stored as a float with 7d history and 365d trends. The value is labelled MiB; the leading "!" in the unit tells Zabbix not to apply its automatic K/M/G prefix conversion, which would otherwise render e.g. 15360 MiB as "15.36 KMiB". --> | |
| <item_prototype> | |
| <name>GPU {#GPUINDEX}: Memory total</name> | |
| <type>ZABBIX_ACTIVE</type> | |
| <key>nvidia.metric[{#GPUINDEX},mem.total]</key> | |
| <delay>1h</delay> | |
| <history>7d</history> | |
| <trends>365d</trends> | |
| <value_type>FLOAT</value_type> | |
| <units>!MiB</units> | |
| </item_prototype> | |
| <!-- Per-GPU memory utilization in percent: active-agent check of nvidia.metric[{#GPUINDEX},mem.util] every 30s, stored as a float with 7d history and 365d trends. Source item for the memory utilization trigger prototype. --> | |
| <item_prototype> | |
| <name>GPU {#GPUINDEX}: Memory utilization</name> | |
| <type>ZABBIX_ACTIVE</type> | |
| <key>nvidia.metric[{#GPUINDEX},mem.util]</key> | |
| <delay>30s</delay> | |
| <history>7d</history> | |
| <trends>365d</trends> | |
| <value_type>FLOAT</value_type> | |
| <units>%</units> | |
| </item_prototype> | |
| <!-- Per-GPU power draw in watts: active-agent check of nvidia.metric[{#GPUINDEX},power] every 30s, stored as a float with 7d history and 365d trends. Source item for the power draw trigger prototype. --> | |
| <item_prototype> | |
| <name>GPU {#GPUINDEX}: Power draw</name> | |
| <type>ZABBIX_ACTIVE</type> | |
| <key>nvidia.metric[{#GPUINDEX},power]</key> | |
| <delay>30s</delay> | |
| <history>7d</history> | |
| <trends>365d</trends> | |
| <value_type>FLOAT</value_type> | |
| <units>W</units> | |
| </item_prototype> | |
| <!-- Per-GPU power limit in watts: active-agent check of nvidia.metric[{#GPUINDEX},power.limit], polled only once per hour, stored as a float with 7d history and 365d trends. No trigger prototype in this template references this item. --> | |
| <item_prototype> | |
| <name>GPU {#GPUINDEX}: Power limit</name> | |
| <type>ZABBIX_ACTIVE</type> | |
| <key>nvidia.metric[{#GPUINDEX},power.limit]</key> | |
| <delay>1h</delay> | |
| <history>7d</history> | |
| <trends>365d</trends> | |
| <value_type>FLOAT</value_type> | |
| <units>W</units> | |
| </item_prototype> | |
| </item_prototypes> | |
| <trigger_prototypes> | |
| <!-- Health: missing data. Fires (severity HIGH) when the utilization item of a discovered GPU has received no value for the {$GPU.NODATA} window; the utilization item serves as the liveness probe for the whole metric script. --> | |
| <trigger_prototype> | |
| <name>GPU {#GPUINDEX}: No data (nvidia-smi/agent script failing)</name> | |
| <severity>HIGH</severity> | |
| <expression>nodata(/Template NVIDIA GPU (nvidia-smi UserParameter)/nvidia.metric[{#GPUINDEX},util],{$GPU.NODATA})=1</expression> | |
| <description>No GPU utilization data received within {$GPU.NODATA}. This usually means nvidia-smi failed or the UserParameter script is failing.</description> | |
| </trigger_prototype> | |
| <!-- Health: temperature. Warning fires when the latest temperature reading exceeds {$GPU.TEMP.WARN}. It depends on the "Temperature HIGH" trigger prototype so that only the more severe problem is shown once the HIGH threshold is crossed, instead of two overlapping alerts for the same GPU. The dependency's name and expression must match the HIGH trigger prototype exactly. --> | |
| <trigger_prototype> | |
| <name>GPU {#GPUINDEX}: Temperature warning (> {$GPU.TEMP.WARN}°C)</name> | |
| <severity>WARNING</severity> | |
| <expression>last(/Template NVIDIA GPU (nvidia-smi UserParameter)/nvidia.metric[{#GPUINDEX},temp])>{$GPU.TEMP.WARN}</expression> | |
| <dependencies> | |
| <dependency> | |
| <name>GPU {#GPUINDEX}: Temperature HIGH (> {$GPU.TEMP.HIGH}°C)</name> | |
| <expression>last(/Template NVIDIA GPU (nvidia-smi UserParameter)/nvidia.metric[{#GPUINDEX},temp])>{$GPU.TEMP.HIGH}</expression> | |
| </dependency> | |
| </dependencies> | |
| </trigger_prototype> | |
| <!-- Fires (severity HIGH) when the latest temperature reading of a discovered GPU exceeds {$GPU.TEMP.HIGH}. --> | |
| <trigger_prototype> | |
| <name>GPU {#GPUINDEX}: Temperature HIGH (> {$GPU.TEMP.HIGH}°C)</name> | |
| <severity>HIGH</severity> | |
| <expression>last(/Template NVIDIA GPU (nvidia-smi UserParameter)/nvidia.metric[{#GPUINDEX},temp])>{$GPU.TEMP.HIGH}</expression> | |
| </trigger_prototype> | |
| <!-- Health: power draw. Fires (severity HIGH) when the latest power reading exceeds the fixed {$GPU.POWER.HIGH} macro; the discovered power.limit item is not consulted, so adjust the macro for GPUs whose cap differs. --> | |
| <trigger_prototype> | |
| <name>GPU {#GPUINDEX}: Power draw HIGH (> {$GPU.POWER.HIGH}W)</name> | |
| <severity>HIGH</severity> | |
| <expression>last(/Template NVIDIA GPU (nvidia-smi UserParameter)/nvidia.metric[{#GPUINDEX},power])>{$GPU.POWER.HIGH}</expression> | |
| <description>Power draw exceeds configured threshold. For Tesla T4, typical cap is ~70W.</description> | |
| </trigger_prototype> | |
| <!-- Health: memory utilization. Fires (severity WARNING) when the latest mem.util reading of a discovered GPU exceeds {$GPU.MEMUTIL.HIGH}. --> | |
| <trigger_prototype> | |
| <name>GPU {#GPUINDEX}: Memory utilization HIGH (> {$GPU.MEMUTIL.HIGH}%)</name> | |
| <severity>WARNING</severity> | |
| <expression>last(/Template NVIDIA GPU (nvidia-smi UserParameter)/nvidia.metric[{#GPUINDEX},mem.util])>{$GPU.MEMUTIL.HIGH}</expression> | |
| </trigger_prototype> | |
| <!-- Health: utilization stuck high. Fires (severity WARNING) when the minimum utilization over the {$GPU.UTIL.STUCK.DUR} window is at or above {$GPU.UTIL.STUCK.HIGH}, i.e. every sample in the window reached the threshold. The name uses ">=" to match the comparison in the expression. --> | |
| <trigger_prototype> | |
| <name>GPU {#GPUINDEX}: Utilization stuck high (>= {$GPU.UTIL.STUCK.HIGH}% for {$GPU.UTIL.STUCK.DUR})</name> | |
| <severity>WARNING</severity> | |
| <expression>min(/Template NVIDIA GPU (nvidia-smi UserParameter)/nvidia.metric[{#GPUINDEX},util],{$GPU.UTIL.STUCK.DUR})>={$GPU.UTIL.STUCK.HIGH}</expression> | |
| <description>Helps catch runaway processes. Adjust or disable if this is expected.</description> | |
| </trigger_prototype> | |
| </trigger_prototypes> | |
| </discovery_rule> | |
| </discovery_rules> | |
| </template> | |
| </templates> | |
| </zabbix_export> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment