christopherpaquin · January 23, 2026 04:10
diff --git a/gistfile1.txt b/gistfile1.txt
 <?xml version="1.0" encoding="UTF-8"?>
 <zabbix_export>
  <version>7.4</version>
  <!-- Template group declaration; the template's own <groups> entry refers to this group by name. -->
  <!-- NOTE(review): no <uuid> child is present. Exports written by recent Zabbix versions normally carry one per entity; confirm the target server imports this file without it. -->
  <template_groups>
    <template_group>
      <name>Templates/Hardware</name>
    </template_group>
  </template_groups>

  <templates>
    <template>
      <!-- Template name. Both elements carry the same string, and every trigger expression in this file addresses its item through that string. -->
      <template>Template NVIDIA GPU (nvidia-smi UserParameter)</template>
      <name>Template NVIDIA GPU (nvidia-smi UserParameter)</name>
      <!-- Places the template in the Templates/Hardware group. -->
      <groups>
        <group>
          <name>Templates/Hardware</name>
        </group>
      </groups>
      <!-- Human-readable summary; the two keys listed are the ones the discovery rule and item prototypes in this file poll. -->
      <description>Monitors NVIDIA GPUs via Zabbix agent UserParameters calling nvidia-smi.
 Requires keys:
 - nvidia.discovery
 - nvidia.metric[INDEX,metric]</description>

      <!-- User macros holding the thresholds and time windows consumed by the trigger prototypes in this file. -->
      <macros>
        <!-- Evaluation window passed to nodata() on the utilization item. -->
        <macro>
          <macro>{$GPU.NODATA}</macro>
          <value>10m</value>
          <description>Alert if GPU metrics stop arriving for this long.</description>
        </macro>
        <!-- Compared (strictly greater than) with last() of the temperature item by the "Temperature warning" trigger. -->
        <macro>
          <macro>{$GPU.TEMP.WARN}</macro>
          <value>80</value>
          <description>Temperature warning threshold (°C).</description>
        </macro>
        <!-- Compared (strictly greater than) with last() of the temperature item by the "Temperature HIGH" trigger. -->
        <macro>
          <macro>{$GPU.TEMP.HIGH}</macro>
          <value>85</value>
          <description>Temperature high/critical threshold (°C).</description>
        </macro>
        <!-- Compared with last() of the power draw item. The default of 70 follows the Tesla T4 figure quoted in the description; other GPU models will likely need an override. -->
        <macro>
          <macro>{$GPU.POWER.HIGH}</macro>
          <value>70</value>
          <description>Power draw high/critical threshold (W). Tesla T4 is typically 70W cap.</description>
        </macro>
        <!-- Compared with last() of the mem.util item. -->
        <macro>
          <macro>{$GPU.MEMUTIL.HIGH}</macro>
          <value>95</value>
          <description>Memory utilization high threshold (%).</description>
        </macro>
        <!-- Threshold for the "stuck high" trigger, which tests min() of utilization with >= (not >). -->
        <macro>
          <macro>{$GPU.UTIL.STUCK.HIGH}</macro>
          <value>99</value>
          <description>GPU utilization 'stuck high' threshold (%).</description>
        </macro>
        <!-- Time window over which min() is evaluated by the "stuck high" trigger. -->
        <macro>
          <macro>{$GPU.UTIL.STUCK.DUR}</macro>
          <value>10m</value>
          <description>Duration for utilization 'stuck high' check.</description>
        </macro>
      </macros>

      <discovery_rules>
        <discovery_rule>
          <!-- Discovery rule settings: collected as a ZABBIX_ACTIVE item under key nvidia.discovery with a 1h delay. -->
          <!-- The item and trigger prototypes of this rule substitute {#GPUINDEX}, so the discovery output presumably supplies that macro for each GPU; verify against the agent script. -->
          <name>NVIDIA GPU discovery</name>
          <type>ZABBIX_ACTIVE</type>
          <key>nvidia.discovery</key>
          <delay>1h</delay>
          <!-- NOTE(review): presumably the retention period for entities whose GPU is no longer discovered; confirm against the Zabbix documentation. -->
          <lifetime>7d</lifetime>
          <description>Discovers GPUs using nvidia.discovery JSON.</description>

          <!-- Per-GPU metrics created for every {#GPUINDEX} the discovery rule reports.
               All are ZABBIX_ACTIVE checks on nvidia.metric[{#GPUINDEX},METRIC], stored as
               FLOAT with 7d history and 365d trends. -->
          <item_prototypes>
            <!-- Readings that change quickly are polled every 30s. -->
            <item_prototype>
              <name>GPU {#GPUINDEX}: Utilization</name>
              <type>ZABBIX_ACTIVE</type>
              <key>nvidia.metric[{#GPUINDEX},util]</key>
              <delay>30s</delay>
              <history>7d</history>
              <trends>365d</trends>
              <value_type>FLOAT</value_type>
              <units>%</units>
            </item_prototype>

            <item_prototype>
              <name>GPU {#GPUINDEX}: Temperature</name>
              <type>ZABBIX_ACTIVE</type>
              <key>nvidia.metric[{#GPUINDEX},temp]</key>
              <delay>30s</delay>
              <history>7d</history>
              <trends>365d</trends>
              <value_type>FLOAT</value_type>
              <units>°C</units>
            </item_prototype>

            <!-- Memory units carry a leading "!" so Zabbix displays the raw MiB figure
                 instead of applying its automatic K/M/G scaling, which would otherwise
                 render a value such as 15360 as "15.36 KMiB". -->
            <item_prototype>
              <name>GPU {#GPUINDEX}: Memory used</name>
              <type>ZABBIX_ACTIVE</type>
              <key>nvidia.metric[{#GPUINDEX},mem.used]</key>
              <delay>30s</delay>
              <history>7d</history>
              <trends>365d</trends>
              <value_type>FLOAT</value_type>
              <units>!MiB</units>
            </item_prototype>

            <!-- Polled only once per hour, unlike the 30s readings. -->
            <item_prototype>
              <name>GPU {#GPUINDEX}: Memory total</name>
              <type>ZABBIX_ACTIVE</type>
              <key>nvidia.metric[{#GPUINDEX},mem.total]</key>
              <delay>1h</delay>
              <history>7d</history>
              <trends>365d</trends>
              <value_type>FLOAT</value_type>
              <units>!MiB</units>
            </item_prototype>

            <item_prototype>
              <name>GPU {#GPUINDEX}: Memory utilization</name>
              <type>ZABBIX_ACTIVE</type>
              <key>nvidia.metric[{#GPUINDEX},mem.util]</key>
              <delay>30s</delay>
              <history>7d</history>
              <trends>365d</trends>
              <value_type>FLOAT</value_type>
              <units>%</units>
            </item_prototype>

            <item_prototype>
              <name>GPU {#GPUINDEX}: Power draw</name>
              <type>ZABBIX_ACTIVE</type>
              <key>nvidia.metric[{#GPUINDEX},power]</key>
              <delay>30s</delay>
              <history>7d</history>
              <trends>365d</trends>
              <value_type>FLOAT</value_type>
              <units>W</units>
            </item_prototype>

            <!-- Polled only once per hour, unlike the 30s readings. -->
            <item_prototype>
              <name>GPU {#GPUINDEX}: Power limit</name>
              <type>ZABBIX_ACTIVE</type>
              <key>nvidia.metric[{#GPUINDEX},power.limit]</key>
              <delay>1h</delay>
              <history>7d</history>
              <trends>365d</trends>
              <value_type>FLOAT</value_type>
              <units>W</units>
            </item_prototype>
          </item_prototypes>

          <!-- Trigger prototypes, instantiated once per discovered GPU. Severity is given
               with <priority>, the element name the Zabbix export schema defines for
               triggers (a <severity> element is not part of that schema). -->
          <trigger_prototypes>
            <!-- Health: missing data -->
            <trigger_prototype>
              <name>GPU {#GPUINDEX}: No data (nvidia-smi/agent script failing)</name>
              <priority>HIGH</priority>
              <expression>nodata(/Template NVIDIA GPU (nvidia-smi UserParameter)/nvidia.metric[{#GPUINDEX},util],{$GPU.NODATA})=1</expression>
              <description>No GPU utilization data received within {$GPU.NODATA}. This usually means nvidia-smi failed or the UserParameter script is failing.</description>
            </trigger_prototype>

            <!-- Health: temperature (two thresholds; both fire independently once the higher one is crossed) -->
            <trigger_prototype>
              <name>GPU {#GPUINDEX}: Temperature warning (&gt; {$GPU.TEMP.WARN}°C)</name>
              <priority>WARNING</priority>
              <expression>last(/Template NVIDIA GPU (nvidia-smi UserParameter)/nvidia.metric[{#GPUINDEX},temp])&gt;{$GPU.TEMP.WARN}</expression>
            </trigger_prototype>

            <trigger_prototype>
              <name>GPU {#GPUINDEX}: Temperature HIGH (&gt; {$GPU.TEMP.HIGH}°C)</name>
              <priority>HIGH</priority>
              <expression>last(/Template NVIDIA GPU (nvidia-smi UserParameter)/nvidia.metric[{#GPUINDEX},temp])&gt;{$GPU.TEMP.HIGH}</expression>
            </trigger_prototype>

            <!-- Health: power draw -->
            <trigger_prototype>
              <name>GPU {#GPUINDEX}: Power draw HIGH (&gt; {$GPU.POWER.HIGH}W)</name>
              <priority>HIGH</priority>
              <expression>last(/Template NVIDIA GPU (nvidia-smi UserParameter)/nvidia.metric[{#GPUINDEX},power])&gt;{$GPU.POWER.HIGH}</expression>
              <description>Power draw exceeds configured threshold. For Tesla T4, typical cap is ~70W.</description>
            </trigger_prototype>

            <!-- Health: memory util high -->
            <trigger_prototype>
              <name>GPU {#GPUINDEX}: Memory utilization HIGH (&gt; {$GPU.MEMUTIL.HIGH}%)</name>
              <priority>WARNING</priority>
              <expression>last(/Template NVIDIA GPU (nvidia-smi UserParameter)/nvidia.metric[{#GPUINDEX},mem.util])&gt;{$GPU.MEMUTIL.HIGH}</expression>
            </trigger_prototype>

            <!-- Health: utilization stuck high. The name shows "&gt;=" to match the operator
                 used in the expression: the trigger fires when the lowest sample in the
                 window is at or above the threshold. -->
            <trigger_prototype>
              <name>GPU {#GPUINDEX}: Utilization stuck high (&gt;= {$GPU.UTIL.STUCK.HIGH}% for {$GPU.UTIL.STUCK.DUR})</name>
              <priority>WARNING</priority>
              <expression>min(/Template NVIDIA GPU (nvidia-smi UserParameter)/nvidia.metric[{#GPUINDEX},util],{$GPU.UTIL.STUCK.DUR})&gt;={$GPU.UTIL.STUCK.HIGH}</expression>
              <description>Helps catch runaway processes. Adjust or disable if this is expected.</description>
            </trigger_prototype>
          </trigger_prototypes>
        </discovery_rule>
      </discovery_rules>
    </template>
  </templates>
 </zabbix_export>
	<?xml version="1.0" encoding="UTF-8"?>
	<zabbix_export>
	<version>7.4</version>
	<!-- Template group declaration; the template's own <groups> entry refers to this group by name. -->
	<!-- NOTE(review): no <uuid> child is present. Exports written by recent Zabbix versions normally carry one per entity; confirm the target server imports this file without it. -->
	<template_groups>
	<template_group>
	<name>Templates/Hardware</name>
	</template_group>
	</template_groups>

	<templates>
	<template>
	<!-- Template name. Both elements carry the same string, and every trigger expression in this file addresses its item through that string. -->
	<template>Template NVIDIA GPU (nvidia-smi UserParameter)</template>
	<name>Template NVIDIA GPU (nvidia-smi UserParameter)</name>
	<!-- Places the template in the Templates/Hardware group. -->
	<groups>
	<group>
	<name>Templates/Hardware</name>
	</group>
	</groups>
	<!-- Human-readable summary; the two keys listed are the ones the discovery rule and item prototypes in this file poll. -->
	<description>Monitors NVIDIA GPUs via Zabbix agent UserParameters calling nvidia-smi.
	Requires keys:
	- nvidia.discovery
	- nvidia.metric[INDEX,metric]</description>

	<!-- User macros holding the thresholds and time windows consumed by the trigger prototypes in this file. -->
	<macros>
	<!-- Evaluation window passed to nodata() on the utilization item. -->
	<macro>
	<macro>{$GPU.NODATA}</macro>
	<value>10m</value>
	<description>Alert if GPU metrics stop arriving for this long.</description>
	</macro>
	<!-- Compared (strictly greater than) with last() of the temperature item by the "Temperature warning" trigger. -->
	<macro>
	<macro>{$GPU.TEMP.WARN}</macro>
	<value>80</value>
	<description>Temperature warning threshold (°C).</description>
	</macro>
	<!-- Compared (strictly greater than) with last() of the temperature item by the "Temperature HIGH" trigger. -->
	<macro>
	<macro>{$GPU.TEMP.HIGH}</macro>
	<value>85</value>
	<description>Temperature high/critical threshold (°C).</description>
	</macro>
	<!-- Compared with last() of the power draw item. The default of 70 follows the Tesla T4 figure quoted in the description; other GPU models will likely need an override. -->
	<macro>
	<macro>{$GPU.POWER.HIGH}</macro>
	<value>70</value>
	<description>Power draw high/critical threshold (W). Tesla T4 is typically 70W cap.</description>
	</macro>
	<!-- Compared with last() of the mem.util item. -->
	<macro>
	<macro>{$GPU.MEMUTIL.HIGH}</macro>
	<value>95</value>
	<description>Memory utilization high threshold (%).</description>
	</macro>
	<!-- Threshold for the "stuck high" trigger, which tests min() of utilization with >= (not >). -->
	<macro>
	<macro>{$GPU.UTIL.STUCK.HIGH}</macro>
	<value>99</value>
	<description>GPU utilization 'stuck high' threshold (%).</description>
	</macro>
	<!-- Time window over which min() is evaluated by the "stuck high" trigger. -->
	<macro>
	<macro>{$GPU.UTIL.STUCK.DUR}</macro>
	<value>10m</value>
	<description>Duration for utilization 'stuck high' check.</description>
	</macro>
	</macros>

	<discovery_rules>
	<discovery_rule>
	<!-- Discovery rule settings: collected as a ZABBIX_ACTIVE item under key nvidia.discovery with a 1h delay. -->
	<!-- The item and trigger prototypes of this rule substitute {#GPUINDEX}, so the discovery output presumably supplies that macro for each GPU; verify against the agent script. -->
	<name>NVIDIA GPU discovery</name>
	<type>ZABBIX_ACTIVE</type>
	<key>nvidia.discovery</key>
	<delay>1h</delay>
	<!-- NOTE(review): presumably the retention period for entities whose GPU is no longer discovered; confirm against the Zabbix documentation. -->
	<lifetime>7d</lifetime>
	<description>Discovers GPUs using nvidia.discovery JSON.</description>

	<!-- Per-GPU metrics created for every {#GPUINDEX} the discovery rule reports. -->
	<!-- All are ZABBIX_ACTIVE checks on nvidia.metric[{#GPUINDEX},METRIC], stored as FLOAT with 7d history and 365d trends. -->
	<item_prototypes>
	<!-- Readings that change quickly are polled every 30s. -->
	<item_prototype>
	<name>GPU {#GPUINDEX}: Utilization</name>
	<type>ZABBIX_ACTIVE</type>
	<key>nvidia.metric[{#GPUINDEX},util]</key>
	<delay>30s</delay>
	<history>7d</history>
	<trends>365d</trends>
	<value_type>FLOAT</value_type>
	<units>%</units>
	</item_prototype>

	<item_prototype>
	<name>GPU {#GPUINDEX}: Temperature</name>
	<type>ZABBIX_ACTIVE</type>
	<key>nvidia.metric[{#GPUINDEX},temp]</key>
	<delay>30s</delay>
	<history>7d</history>
	<trends>365d</trends>
	<value_type>FLOAT</value_type>
	<units>°C</units>
	</item_prototype>

	<!-- Memory units carry a leading "!" so Zabbix displays the raw MiB figure instead of applying its automatic K/M/G scaling, which would otherwise render a value such as 15360 as "15.36 KMiB". -->
	<item_prototype>
	<name>GPU {#GPUINDEX}: Memory used</name>
	<type>ZABBIX_ACTIVE</type>
	<key>nvidia.metric[{#GPUINDEX},mem.used]</key>
	<delay>30s</delay>
	<history>7d</history>
	<trends>365d</trends>
	<value_type>FLOAT</value_type>
	<units>!MiB</units>
	</item_prototype>

	<!-- Polled only once per hour, unlike the 30s readings. -->
	<item_prototype>
	<name>GPU {#GPUINDEX}: Memory total</name>
	<type>ZABBIX_ACTIVE</type>
	<key>nvidia.metric[{#GPUINDEX},mem.total]</key>
	<delay>1h</delay>
	<history>7d</history>
	<trends>365d</trends>
	<value_type>FLOAT</value_type>
	<units>!MiB</units>
	</item_prototype>

	<item_prototype>
	<name>GPU {#GPUINDEX}: Memory utilization</name>
	<type>ZABBIX_ACTIVE</type>
	<key>nvidia.metric[{#GPUINDEX},mem.util]</key>
	<delay>30s</delay>
	<history>7d</history>
	<trends>365d</trends>
	<value_type>FLOAT</value_type>
	<units>%</units>
	</item_prototype>

	<item_prototype>
	<name>GPU {#GPUINDEX}: Power draw</name>
	<type>ZABBIX_ACTIVE</type>
	<key>nvidia.metric[{#GPUINDEX},power]</key>
	<delay>30s</delay>
	<history>7d</history>
	<trends>365d</trends>
	<value_type>FLOAT</value_type>
	<units>W</units>
	</item_prototype>

	<!-- Polled only once per hour, unlike the 30s readings. -->
	<item_prototype>
	<name>GPU {#GPUINDEX}: Power limit</name>
	<type>ZABBIX_ACTIVE</type>
	<key>nvidia.metric[{#GPUINDEX},power.limit]</key>
	<delay>1h</delay>
	<history>7d</history>
	<trends>365d</trends>
	<value_type>FLOAT</value_type>
	<units>W</units>
	</item_prototype>
	</item_prototypes>

	<!-- Trigger prototypes, instantiated once per discovered GPU. Severity is given with <priority>, the element name the Zabbix export schema defines for triggers (a <severity> element is not part of that schema). -->
	<trigger_prototypes>
	<!-- Health: missing data -->
	<trigger_prototype>
	<name>GPU {#GPUINDEX}: No data (nvidia-smi/agent script failing)</name>
	<priority>HIGH</priority>
	<expression>nodata(/Template NVIDIA GPU (nvidia-smi UserParameter)/nvidia.metric[{#GPUINDEX},util],{$GPU.NODATA})=1</expression>
	<description>No GPU utilization data received within {$GPU.NODATA}. This usually means nvidia-smi failed or the UserParameter script is failing.</description>
	</trigger_prototype>

	<!-- Health: temperature (two thresholds; both fire independently once the higher one is crossed) -->
	<trigger_prototype>
	<name>GPU {#GPUINDEX}: Temperature warning (> {$GPU.TEMP.WARN}°C)</name>
	<priority>WARNING</priority>
	<expression>last(/Template NVIDIA GPU (nvidia-smi UserParameter)/nvidia.metric[{#GPUINDEX},temp])>{$GPU.TEMP.WARN}</expression>
	</trigger_prototype>

	<trigger_prototype>
	<name>GPU {#GPUINDEX}: Temperature HIGH (> {$GPU.TEMP.HIGH}°C)</name>
	<priority>HIGH</priority>
	<expression>last(/Template NVIDIA GPU (nvidia-smi UserParameter)/nvidia.metric[{#GPUINDEX},temp])>{$GPU.TEMP.HIGH}</expression>
	</trigger_prototype>

	<!-- Health: power draw -->
	<trigger_prototype>
	<name>GPU {#GPUINDEX}: Power draw HIGH (> {$GPU.POWER.HIGH}W)</name>
	<priority>HIGH</priority>
	<expression>last(/Template NVIDIA GPU (nvidia-smi UserParameter)/nvidia.metric[{#GPUINDEX},power])>{$GPU.POWER.HIGH}</expression>
	<description>Power draw exceeds configured threshold. For Tesla T4, typical cap is ~70W.</description>
	</trigger_prototype>

	<!-- Health: memory util high -->
	<trigger_prototype>
	<name>GPU {#GPUINDEX}: Memory utilization HIGH (> {$GPU.MEMUTIL.HIGH}%)</name>
	<priority>WARNING</priority>
	<expression>last(/Template NVIDIA GPU (nvidia-smi UserParameter)/nvidia.metric[{#GPUINDEX},mem.util])>{$GPU.MEMUTIL.HIGH}</expression>
	</trigger_prototype>

	<!-- Health: utilization stuck high. The name shows ">=" to match the operator used in the expression: the trigger fires when the lowest sample in the window is at or above the threshold. -->
	<trigger_prototype>
	<name>GPU {#GPUINDEX}: Utilization stuck high (>= {$GPU.UTIL.STUCK.HIGH}% for {$GPU.UTIL.STUCK.DUR})</name>
	<priority>WARNING</priority>
	<expression>min(/Template NVIDIA GPU (nvidia-smi UserParameter)/nvidia.metric[{#GPUINDEX},util],{$GPU.UTIL.STUCK.DUR})>={$GPU.UTIL.STUCK.HIGH}</expression>
	<description>Helps catch runaway processes. Adjust or disable if this is expected.</description>
	</trigger_prototype>
	</trigger_prototypes>
	</discovery_rule>
	</discovery_rules>
	</template>
	</templates>
	</zabbix_export>
No results found