file read tool
read_file_slice() {
    local path="$1"
    local start_byte="${2:-0}"
    local length="${3:-32768}"
    local show_hex="${4:-false}"

    # Safety caps
    (( length > 131072 )) && length=131072
    (( start_byte < 0 )) && start_byte=0

    # Resolve path
    path=$(realpath "$path" 2>/dev/null || printf '%s\n' "$path")

    # Header
    printf '[read_file_slice] %s\n' "$path"
    printf '[Requested bytes: %d–%d (%d bytes)]\n\n' \
        "$start_byte" "$((start_byte + length - 1))" "$length"

    # ── Extract the exact byte range ─────────────────────────────────────
    if [[ "$path" == *.gz ]]; then
        # gzip streams cannot be seeked in the compressed domain, so decompress
        # from the start and skip the requested number of uncompressed bytes
        gzip -dc "$path" 2>/dev/null |
            dd bs=1 skip="$start_byte" count="$length" status=none 2>/dev/null
    else
        dd if="$path" bs=1 skip="$start_byte" count="$length" status=none 2>/dev/null
    fi | {
        # Spool the slice to a temp file so NUL bytes survive
        # (bash variables cannot hold NUL, so reading into a variable would
        # truncate binary data)
        tmp=$(mktemp)
        cat > "$tmp"

        if [ "$show_hex" = true ]; then
            printf '[show_hex forced → hexdump]\n'
            hexdump -v -e '16/1 "%02X "' -e '"  " 16/1 "%_p" "\n"' "$tmp"
        else
            # Detect binary: >15% non-printable/control chars in the first 32KB
            sample=$(head -c 32768 "$tmp" | wc -c)
            nonprint=$(head -c 32768 "$tmp" |
                tr -d -c '\000-\010\016-\037\177-\377' | wc -c)
            if (( sample == 0 || nonprint * 100 <= sample * 15 )); then
                # Text → raw output
                cat "$tmp"
            else
                # Binary → hexdump of the requested range
                printf '[BINARY DETECTED → hexdump of requested range]\n'
                hexdump -v -e '16/1 "%02X "' -e '"  " 16/1 "%_p" "\n"' "$tmp"
            fi
        fi
        rm -f "$tmp"
    }
}
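
If an agent harness needs to call this slicer from Python, a minimal sketch could look like the following. The ./read_file_slice.sh filename and the wrapper itself are assumptions; only the function name and argument order come from the definition above.

    # Hypothetical wrapper: assumes the function above is saved as ./read_file_slice.sh
    import shlex
    import subprocess

    def read_file_slice(path, start_byte=0, length=32768, show_hex=False):
        cmd = (
            "source ./read_file_slice.sh && "
            f"read_file_slice {shlex.quote(path)} {start_byte} {length} "
            f"{'true' if show_hex else 'false'}"
        )
        out = subprocess.run(["bash", "-c", cmd], capture_output=True)
        # stdout is either raw text or a hexdump, depending on the binary detection
        return out.stdout.decode("utf-8", errors="replace")

    print(read_file_slice("/var/log/syslog", start_byte=0, length=4096))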
jpic (Author) commented on Nov 20, 2025:
import textwrap
from typing import Optional

import httpx

# Tool, Parameter and get_thanos_url_for_hostname come from the surrounding
# agent framework and are not defined in this snippet.


class ListPrometheusInstances(Tool):
    description = textwrap.dedent('''
        List currently scraped instances in the correct environment.
        Your setup:
        • job="node" → node_exporter (instance = primary-hostname:9100)
        • job="nodetool" → centralized custom exporter (instance is useless, use hostname= label instead)
    ''')

    hostname_hint = Parameter(
        type='string',
        description='Any hostname you know in the target environment (e.g. web01, cass07, db-stg-03)',
        required=True,
    )
    job = Parameter(
        type='string',
        description="Filter by job: 'node' or 'nodetool'",
        required=False,
    )

    async def run(self, conversation, hostname_hint: str, job: Optional[str] = None) -> str:
        try:
            thanos_base = await get_thanos_url_for_hostname(hostname_hint)
        except Exception as e:
            return f"[list_prometheus_instances] ERROR: {e}"

        env_name = 'PRODUCTION' if 'prod' in thanos_base.lower() else 'STAGING'

        # Build the correct query for each job
        if job == 'node':
            promql = 'up{job="node"}'
        elif job == 'nodetool':
            promql = 'up{job="nodetool"}'
        else:
            promql = 'up{job=~"node|nodetool"}'

        async with httpx.AsyncClient(timeout=20.0) as client:
            try:
                resp = await client.get(
                    f"{thanos_base}/api/v1/query",
                    params={'query': promql},
                    timeout=20,
                )
                resp.raise_for_status()
                results = resp.json()['data']['result']
            except Exception as e:
                return f"[list_prometheus_instances] {env_name} — Query failed: {e}"

        if not results:
            return f"[list_prometheus_instances] {env_name} — No instances found."

        lines = [
            f"[list_prometheus_instances] {env_name} (via {hostname_hint})",
            f" Found {len(results)} instances\n",
        ]
        for r in sorted(results, key=lambda x: x['metric'].get('instance', '') or x['metric'].get('hostname', '')):
            m = r['metric']
            job_name = m.get('job', 'unknown')
            if job_name == 'node':
                instance = m.get('instance', '?')
                clean_host = instance.split(':')[0]  # removes :9100
                display = f"{clean_host:<45} node_exporter"
            else:  # job="nodetool"
                hostname_label = m.get('hostname', '?')
                useless_instance = m.get('instance', '?')
                display = f"{hostname_label:<30} nodetool_exporter (instance={useless_instance} → ignore)"
            lines.append(f" {display}")

        return '\n'.join(lines)
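
A rough driver for the class above, assuming the Tool base class can be instantiated without arguments and that conversation=None is acceptable; neither assumption is confirmed by this gist, and the hostname hint is a placeholder.

    # Hypothetical driver; only the run() signature comes from the class above.
    import asyncio

    async def main():
        tool = ListPrometheusInstances()  # assumes a no-argument constructor
        listing = await tool.run(conversation=None, hostname_hint="cass-prod-07", job="nodetool")
        print(listing)

    asyncio.run(main())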
### METRICS RULES — OBEY OR FAIL

You have exactly two exporters:

1. Node exporter
   - job="node"
   - instance = "primary-hostname:9100" → use this exactly
2. Centralized nodetool exporter
   - job="nodetool"
   - instance = useless (always the same) → IGNORE IT
   - hostname = "cass-prod-07" (real Cassandra node) → use this instead (see the sketch below)
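
To make the label rules concrete, here is a sketch of picking the right identifier out of an up{} result. THANOS_BASE is a placeholder, and the /api/v1/query call mirrors the tool code above.

    # Sketch only: THANOS_BASE is hypothetical; label names come from the rules above.
    import httpx

    THANOS_BASE = "https://thanos.example.internal"

    resp = httpx.get(f"{THANOS_BASE}/api/v1/query",
                     params={"query": 'up{job=~"node|nodetool"}'}, timeout=20)
    for r in resp.json()["data"]["result"]:
        m = r["metric"]
        if m.get("job") == "node":
            target = m["instance"]           # e.g. "web-prod-01.example.com:9100"
        else:                                # job="nodetool"
            target = m.get("hostname", "?")  # ignore m["instance"] here
        print(m.get("job"), "→", target)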
### WORKFLOW (never skip a step)

1. Always start with:

       list_prometheus_instances hostname_hint=<any-host> job=<node|nodetool>

2. Copy the exact instance (for node) or hostname label (for nodetool) from the output.
3. Write queries exactly like this (an end-to-end sketch follows at the end of these rules):

       # System / host
       node_memory_MemAvailable_bytes{job="node", instance="web-prod-01.example.com:9100"}
       100 * (1 - node_memory_MemAvailable_bytes{job="node", instance="db01:9100"} / node_memory_MemTotal_bytes{job="node", instance="db01:9100"})

       # Cassandra
       cassandra_heap_used_bytes{job="nodetool", hostname="cass-prod-07"}
       rate(cassandra_gc_duration_seconds_count{job="nodetool", hostname="cass-stg-03"}[5m])
Never guess labels. Never use the wrong job.
If unsure → run list_prometheus_instances first.
Do it right → you’re faster than any human.
Do it wrong → you’re useless.
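
Putting the workflow together, here is a sketch that runs the documented example queries against the same query API the tool above uses. THANOS_BASE and the instance/hostname values are placeholders copied from the examples, not real targets.

    # Sketch under assumptions: THANOS_BASE is hypothetical, queries come from the rules above.
    import httpx

    THANOS_BASE = "https://thanos.example.internal"

    QUERIES = [
        # System / host: exact instance label copied from list_prometheus_instances
        'node_memory_MemAvailable_bytes{job="node", instance="web-prod-01.example.com:9100"}',
        # Cassandra: exact hostname label, instance ignored
        'rate(cassandra_gc_duration_seconds_count{job="nodetool", hostname="cass-stg-03"}[5m])',
    ]

    for promql in QUERIES:
        resp = httpx.get(f"{THANOS_BASE}/api/v1/query", params={"query": promql}, timeout=20)
        resp.raise_for_status()
        for r in resp.json()["data"]["result"]:
            ts, value = r["value"]  # instant query result: [unix timestamp, value as string]
            print(promql, "→", value)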