Skip to content

Instantly share code, notes, and snippets.

@popmonkey
Created August 23, 2025 15:21
Show Gist options
  • Select an option

  • Save popmonkey/c06e491e5e8c45274ae3432c06ddcf46 to your computer and use it in GitHub Desktop.

Select an option

Save popmonkey/c06e491e5e8c45274ae3432c06ddcf46 to your computer and use it in GitHub Desktop.
file system inventory/summary
#!/bin/bash
# ==============================================================================
# Filesystem Inventory Processor
#
# Description:
# A self-contained script to process the output of `find ... -ls`.
# It accepts input from a file or stdin, displays a real-time line count
# on the terminal, and writes a clean, color-free report to a specified
# output file.
#
# Usage:
# # From a file
# ./inventory_processor.sh /path/to/find_output.txt /path/to/report.txt
#
# # From a pipe (e.g., local scan)
# sudo find / -ls | ./inventory_processor.sh - /path/to/report.txt
#
# # From a remote pipe
# ssh user@host "find . -ls" | ./inventory_processor.sh - /path/to/report.txt
#
# Author:
# popmonkey & Gemini
# ==============================================================================
# --- Embedded AWK Script ---
# The core processing logic is contained within this AWK script.
# It is passed to the awk command via a variable.
#
# Input:  lines in `find ... -ls` layout, where field 3 is the permission
#         string, field 7 the size in bytes and field 11 onward the path.
# Output: the report on stdout (section headers carry ANSI colors); progress
#         messages on stderr.
#
# The script sticks to portable awk features (no PROCINFO, no "%'d" printf
# flag) so it does not depend on gawk being the installed awk.
#
# Note: `read -d ''` returns non-zero when it reaches the end of the heredoc;
# that is expected and the variable is still populated.
read -r -d '' AWK_SCRIPT <<'EOF'
# Returns the parent directory of `path`, or "" when `path` has no parent
# ("/" itself, or a path without any slash).
function parent_of(path,    parent) {
  if (path == "/") {
    return ""
  }
  parent = path
  if (sub(/\/[^\/]*$/, "", parent) == 0) {
    return ""
  }
  # Stripping the last component of "/name" leaves "", which means root.
  return (parent == "") ? "/" : parent
}

BEGIN {
  # --- Configuration ---
  SIZE_THRESHOLD_GB = 1
  FILE_COUNT_THRESHOLD = 1000
  BYTES_PER_GB = 1024 * 1024 * 1024
  SIZE_THRESHOLD_BYTES = SIZE_THRESHOLD_GB * BYTES_PER_GB
  # Numeric, descending sort on the leading number of each report line.
  # LC_ALL=C keeps "." as the decimal point regardless of the user's locale.
  SORT_CMD = "LC_ALL=C sort -k1,1nr"
  # --- ANSI Color Codes ---
  CYAN = "\033[0;36m"
  GREEN = "\033[0;32m"
  WHITE = "\033[1;37m"
  NC = "\033[0m"
  print CYAN "==> Processing file list..." NC > "/dev/stderr"
}

{
  # Print a progress update to stderr every 10,000 lines.
  # The carriage return `\r` keeps the output on a single line.
  if (NR % 10000 == 0) {
    printf "%s==> Processed %d lines...\r%s", CYAN, NR, NC > "/dev/stderr"
  }

  # Only regular files (permission string starting with "-") are counted.
  if (NF < 11 || substr($3, 1, 1) != "-") {
    next
  }

  file_size = $7 + 0

  # The path is everything after the first ten fields. Taking $11 alone
  # would cut a path containing whitespace at its first space, so strip the
  # ten leading fields from the raw line instead; the remainder is kept
  # exactly as it appears in the input.
  if (NF == 11) {
    full_path = $11
  } else {
    full_path = $0
    for (i = 1; i <= 10; i++) {
      sub(/^[ \t]*[^ \t]+[ \t]+/, "", full_path)
    }
  }

  if (file_size > SIZE_THRESHOLD_BYTES) {
    large_files[full_path] = file_size
  }

  parent_dir = parent_of(full_path)
  if (parent_dir == "") {
    # A bare file name with no directory part: account it to ".".
    parent_dir = "."
  }
  dir_file_counts[parent_dir]++
  dir_sizes[parent_dir] += file_size
}

END {
  # Clear the progress line and print the final count.
  printf "%s==> Processed %d total lines. Calculating sizes...%s\n", CYAN, NR, NC > "/dev/stderr"

  # Roll each directory's direct size up into every ancestor. Working from a
  # snapshot of the direct sizes keeps the result independent of iteration
  # order, and walking the full ancestor chain also credits directories that
  # contain only subdirectories (and "/" itself).
  for (dir in dir_sizes) {
    direct_sizes[dir] = dir_sizes[dir]
  }
  for (dir in direct_sizes) {
    ancestor = parent_of(dir)
    while (ancestor != "") {
      dir_sizes[ancestor] += direct_sizes[dir]
      ancestor = parent_of(ancestor)
    }
  }

  print CYAN "==> Generating summary..." NC > "/dev/stderr"

  print "\n" GREEN "--- Directories with total size > " SIZE_THRESHOLD_GB "GB (sorted largest to smallest) ---" NC
  # Flush the header before sort writes to the same stdout, so the header
  # cannot end up after the sorted lines.
  fflush()
  for (dir in dir_sizes) {
    if (dir_sizes[dir] > SIZE_THRESHOLD_BYTES) {
      printf "%.2f GB\t%s\n", dir_sizes[dir] / BYTES_PER_GB, dir | SORT_CMD
    }
  }
  close(SORT_CMD)

  print "\n" GREEN "--- Directories with > " FILE_COUNT_THRESHOLD " files (direct children only) ---" NC
  fflush()
  for (dir in dir_file_counts) {
    # Strictly greater, matching the section header.
    if (dir_file_counts[dir] > FILE_COUNT_THRESHOLD) {
      printf "%d files\t- %s\n", dir_file_counts[dir], dir | SORT_CMD
    }
  }
  close(SORT_CMD)

  print "\n" GREEN "--- Individual files > " SIZE_THRESHOLD_GB "GB (sorted largest to smallest) ---" NC
  fflush()
  for (file in large_files) {
    printf "%.2f GB\t%s\n", large_files[file] / BYTES_PER_GB, file | SORT_CMD
  }
  close(SORT_CMD)
}
EOF
# --- Main Script Logic ---
# Arguments:
#   $1 - input file holding `find ... -ls` output, or '-' to read stdin
#   $2 - path of the report file to write
# Exit status: 0 on success, 1 on bad arguments, missing input, or a failed
# processing pipeline.

# Make the processing pipeline fail if any stage fails, not only the last one.
set -o pipefail

# 1. Validate Input
if [ "$#" -ne 2 ]; then
  echo "Error: Incorrect number of arguments." >&2
  echo "Usage: $0 <input_file_or_dash> <output_file>" >&2
  exit 1
fi
INPUT_SOURCE="$1"
OUTPUT_FILE="$2"
# Use /dev/stdin if the input source is a dash '-'.
if [ "$INPUT_SOURCE" = "-" ]; then
  INPUT_SOURCE="/dev/stdin"
elif [ ! -f "$INPUT_SOURCE" ]; then
  echo "Error: Input file not found at '$INPUT_SOURCE'" >&2
  exit 1
fi

# 2. Execute and Process
# Process the data:
# - `awk` reads the source directly (file or stdin) and runs the inventory
#   logic, printing progress to stderr.
# - `sed` strips ANSI color codes from stdout for the output file. The ESC
#   byte is passed literally so the expression does not rely on GNU-only
#   `\x1b` escapes or the `-r` flag.
ESC=$'\033'
if ! awk "$AWK_SCRIPT" < "$INPUT_SOURCE" \
  | sed "s/${ESC}\[[0-9;]*m//g" > "$OUTPUT_FILE"; then
  echo "Error: Failed to process '$1' or to write '$OUTPUT_FILE'" >&2
  exit 1
fi

# 3. Final Message
# Only reached when the whole pipeline succeeded.
printf "\033[0;36m==> Inventory complete! Report saved to '%s'\033[0m\n" "$OUTPUT_FILE" >&2
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment