Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Select an option

  • Save christopherpaquin/3c500b462d72cb04caf5f0dabda8e5cf to your computer and use it in GitHub Desktop.

Select an option

Save christopherpaquin/3c500b462d72cb04caf5f0dabda8e5cf to your computer and use it in GitHub Desktop.
Template-NVIDIA-nvidia-smi.xml
<?xml version="1.0" encoding="UTF-8"?>
<zabbix_export>
<version>7.4</version>
<template_groups>
  <!--
    Template group that the template in this file is placed into.
    The import format requires a uuid on every template group: a UUIDv4
    written as 32 lowercase hex characters without dashes. The value below
    was generated for this file; keep it stable across revisions so that
    re-imports update the same group.
  -->
  <template_group>
    <uuid>7c1e4b9a52d34f6e9a183b0d6e2f7c41</uuid>
    <name>Templates/Hardware</name>
  </template_group>
</template_groups>
<templates>
  <!--
    NVIDIA GPU template driven by two agent keys (see the description):
      nvidia.discovery              low-level discovery of GPUs
      nvidia.metric[INDEX,metric]   one metric of one GPU
    All items are active agent checks (type ZABBIX_ACTIVE).

    Every entity carries a uuid (UUIDv4, 32 lowercase hex characters, no
    dashes) because the template import format requires one. The values were
    generated for this file; keep them stable across revisions.
  -->
  <template>
    <uuid>e3a90c571b844d2a8f63c5d71e094ab2</uuid>
    <!--
      Technical name: only alphanumerics, spaces, dots, dashes and underscores
      are accepted, so the parentheses live in the visible name only. Every
      trigger expression below refers to the template by this technical name;
      change both together.
    -->
    <template>Template NVIDIA GPU nvidia-smi UserParameter</template>
    <name>Template NVIDIA GPU (nvidia-smi UserParameter)</name>
    <groups>
      <group>
        <name>Templates/Hardware</name>
      </group>
    </groups>
    <description>Monitors NVIDIA GPUs via Zabbix agent UserParameters calling nvidia-smi.
Requires keys:
- nvidia.discovery
- nvidia.metric[INDEX,metric]</description>
    <!-- User macros: thresholds and durations used by the trigger prototypes. -->
    <macros>
      <macro>
        <macro>{$GPU.NODATA}</macro>
        <value>10m</value>
        <description>Alert if GPU metrics stop arriving for this long.</description>
      </macro>
      <macro>
        <macro>{$GPU.TEMP.WARN}</macro>
        <value>80</value>
        <description>Temperature warning threshold (°C).</description>
      </macro>
      <macro>
        <macro>{$GPU.TEMP.HIGH}</macro>
        <value>85</value>
        <description>Temperature high/critical threshold (°C).</description>
      </macro>
      <macro>
        <macro>{$GPU.POWER.HIGH}</macro>
        <value>70</value>
        <description>Power draw high/critical threshold (W). Tesla T4 is typically 70W cap.</description>
      </macro>
      <macro>
        <macro>{$GPU.MEMUTIL.HIGH}</macro>
        <value>95</value>
        <description>Memory utilization high threshold (%).</description>
      </macro>
      <macro>
        <macro>{$GPU.UTIL.STUCK.HIGH}</macro>
        <value>99</value>
        <description>GPU utilization 'stuck high' threshold (%).</description>
      </macro>
      <macro>
        <macro>{$GPU.UTIL.STUCK.DUR}</macro>
        <value>10m</value>
        <description>Duration for utilization 'stuck high' check.</description>
      </macro>
    </macros>
    <discovery_rules>
      <!--
        Runs nvidia.discovery hourly. The prototypes below are instantiated
        once per discovered GPU, keyed by the {#GPUINDEX} LLD macro.
      -->
      <discovery_rule>
        <uuid>0b6f2d18a7c44e93b2516d8e0f3a9c75</uuid>
        <name>NVIDIA GPU discovery</name>
        <type>ZABBIX_ACTIVE</type>
        <key>nvidia.discovery</key>
        <delay>1h</delay>
        <lifetime>7d</lifetime>
        <description>Discovers GPUs using nvidia.discovery JSON.</description>
        <item_prototypes>
          <!-- Fast-changing metrics poll every 30s; static capacities poll every 1h. -->
          <item_prototype>
            <uuid>5d27a1f03c964b8ea40d91e7c2b65f38</uuid>
            <name>GPU {#GPUINDEX}: Utilization</name>
            <type>ZABBIX_ACTIVE</type>
            <key>nvidia.metric[{#GPUINDEX},util]</key>
            <delay>30s</delay>
            <history>7d</history>
            <trends>365d</trends>
            <value_type>FLOAT</value_type>
            <units>%</units>
          </item_prototype>
          <item_prototype>
            <uuid>a84c0e63f19d4a579c2b07d3e6815fb4</uuid>
            <name>GPU {#GPUINDEX}: Temperature</name>
            <type>ZABBIX_ACTIVE</type>
            <key>nvidia.metric[{#GPUINDEX},temp]</key>
            <delay>30s</delay>
            <history>7d</history>
            <trends>365d</trends>
            <value_type>FLOAT</value_type>
            <units>°C</units>
          </item_prototype>
          <!--
            Memory sizes are reported in MiB. The leading "!" stops Zabbix from
            adding its own K/M/G prefix to the unit (which would otherwise
            render large values as "KMiB").
          -->
          <item_prototype>
            <uuid>2f9b7d4e6a014c388e75b3c10f9d2a6e</uuid>
            <name>GPU {#GPUINDEX}: Memory used</name>
            <type>ZABBIX_ACTIVE</type>
            <key>nvidia.metric[{#GPUINDEX},mem.used]</key>
            <delay>30s</delay>
            <history>7d</history>
            <trends>365d</trends>
            <value_type>FLOAT</value_type>
            <units>!MiB</units>
          </item_prototype>
          <item_prototype>
            <uuid>c6e15a920d7f43b1a9e45f8207c3d1b6</uuid>
            <name>GPU {#GPUINDEX}: Memory total</name>
            <type>ZABBIX_ACTIVE</type>
            <key>nvidia.metric[{#GPUINDEX},mem.total]</key>
            <delay>1h</delay>
            <history>7d</history>
            <trends>365d</trends>
            <value_type>FLOAT</value_type>
            <units>!MiB</units>
          </item_prototype>
          <item_prototype>
            <uuid>914d3fb7e25c4706b83ad0a6e19c4f52</uuid>
            <name>GPU {#GPUINDEX}: Memory utilization</name>
            <type>ZABBIX_ACTIVE</type>
            <key>nvidia.metric[{#GPUINDEX},mem.util]</key>
            <delay>30s</delay>
            <history>7d</history>
            <trends>365d</trends>
            <value_type>FLOAT</value_type>
            <units>%</units>
          </item_prototype>
          <item_prototype>
            <uuid>3ae806c197b24fd59061e4c7b25a8d09</uuid>
            <name>GPU {#GPUINDEX}: Power draw</name>
            <type>ZABBIX_ACTIVE</type>
            <key>nvidia.metric[{#GPUINDEX},power]</key>
            <delay>30s</delay>
            <history>7d</history>
            <trends>365d</trends>
            <value_type>FLOAT</value_type>
            <units>W</units>
          </item_prototype>
          <item_prototype>
            <uuid>f05b92d64e3a4189ac7e1b6d03f85c27</uuid>
            <name>GPU {#GPUINDEX}: Power limit</name>
            <type>ZABBIX_ACTIVE</type>
            <key>nvidia.metric[{#GPUINDEX},power.limit]</key>
            <delay>1h</delay>
            <history>7d</history>
            <trends>365d</trends>
            <value_type>FLOAT</value_type>
            <units>W</units>
          </item_prototype>
        </item_prototypes>
        <trigger_prototypes>
          <!--
            Trigger severity is expressed with the "priority" element in the
            export format. Expressions address items as
            /<template technical name>/<item key>.
          -->
          <!-- Health: missing data -->
          <trigger_prototype>
            <uuid>68c3e7a4b90f4d128b56a2f1d4e70c93</uuid>
            <name>GPU {#GPUINDEX}: No data (nvidia-smi/agent script failing)</name>
            <priority>HIGH</priority>
            <expression>nodata(/Template NVIDIA GPU nvidia-smi UserParameter/nvidia.metric[{#GPUINDEX},util],{$GPU.NODATA})=1</expression>
            <description>No GPU utilization data received within {$GPU.NODATA}. This usually means nvidia-smi failed or the UserParameter script is failing.</description>
          </trigger_prototype>
          <!-- Health: temperature -->
          <trigger_prototype>
            <uuid>d2704f8b15e64ca993fd7e0b5a61c3d8</uuid>
            <name>GPU {#GPUINDEX}: Temperature warning (&gt; {$GPU.TEMP.WARN}°C)</name>
            <priority>WARNING</priority>
            <expression>last(/Template NVIDIA GPU nvidia-smi UserParameter/nvidia.metric[{#GPUINDEX},temp])&gt;{$GPU.TEMP.WARN}</expression>
          </trigger_prototype>
          <trigger_prototype>
            <uuid>4b1ea5c9d8634e70b12c09f6d3a7e584</uuid>
            <name>GPU {#GPUINDEX}: Temperature HIGH (&gt; {$GPU.TEMP.HIGH}°C)</name>
            <priority>HIGH</priority>
            <expression>last(/Template NVIDIA GPU nvidia-smi UserParameter/nvidia.metric[{#GPUINDEX},temp])&gt;{$GPU.TEMP.HIGH}</expression>
          </trigger_prototype>
          <!-- Health: power draw -->
          <trigger_prototype>
            <uuid>8f62d0b37ca54b9fa3e8c15d94e2076a</uuid>
            <name>GPU {#GPUINDEX}: Power draw HIGH (&gt; {$GPU.POWER.HIGH}W)</name>
            <priority>HIGH</priority>
            <expression>last(/Template NVIDIA GPU nvidia-smi UserParameter/nvidia.metric[{#GPUINDEX},power])&gt;{$GPU.POWER.HIGH}</expression>
            <description>Power draw exceeds configured threshold. For Tesla T4, typical cap is ~70W.</description>
          </trigger_prototype>
          <!-- Health: memory util high -->
          <trigger_prototype>
            <uuid>1ad95e07c34246fb8d07f6b3e8a1952c</uuid>
            <name>GPU {#GPUINDEX}: Memory utilization HIGH (&gt; {$GPU.MEMUTIL.HIGH}%)</name>
            <priority>WARNING</priority>
            <expression>last(/Template NVIDIA GPU nvidia-smi UserParameter/nvidia.metric[{#GPUINDEX},mem.util])&gt;{$GPU.MEMUTIL.HIGH}</expression>
          </trigger_prototype>
          <!-- Health: utilization stuck high (minimum over the whole window stays at or above the threshold) -->
          <trigger_prototype>
            <uuid>b7f3086d2e914a6c9e4f350c7d1ba8e6</uuid>
            <name>GPU {#GPUINDEX}: Utilization stuck high (&gt; {$GPU.UTIL.STUCK.HIGH}% for {$GPU.UTIL.STUCK.DUR})</name>
            <priority>WARNING</priority>
            <expression>min(/Template NVIDIA GPU nvidia-smi UserParameter/nvidia.metric[{#GPUINDEX},util],{$GPU.UTIL.STUCK.DUR})&gt;={$GPU.UTIL.STUCK.HIGH}</expression>
            <description>Helps catch runaway processes. Adjust or disable if this is expected.</description>
          </trigger_prototype>
        </trigger_prototypes>
      </discovery_rule>
    </discovery_rules>
  </template>
</templates>
</zabbix_export>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment