Skip to content

Instantly share code, notes, and snippets.

@timm
Last active November 30, 2025 22:26
Show Gist options
  • Select an option

  • Save timm/9d85996f6750f3a828609c4140c0267c to your computer and use it in GitHub Desktop.

Select an option

Save timm/9d85996f6750f3a828609c4140c0267c to your computer and use it in GitHub Desktop.
Lightweight AI : a little scripting goes a long way
#!/usr/bin/env gawk -f
# Copyright (c) 2025 Tim Menzies, MIT License
# https://opensource.org/licenses/MIT
# Input: one whitespace-separated "actual predicted" pair per line.
BEGIN   { FS=" " }
# Lines with fewer than two fields (e.g. a trailing blank line) are skipped;
# passing them to seen() would invent an empty-string class in the report.
NF >= 2 { seen($1,$2) }
END     { report() }
# seen(want,got): fold one (actual, predicted) pair into the per-class counts in cf.
#   want : actual label       got : predicted label
# A label met for the first time starts with tn = number of rows already seen.
function seen(want,got,    label,hit) {
  if (!(want in cf)) cf[want]["tn"] = total
  if (!(got  in cf)) cf[got]["tn"]  = total
  total++
  for (label in cf)
    if (label == want) {
      hit = (got == want)
      cf[label]["tp"] += hit
      cf[label]["fn"] += !hit
    } else {
      hit = (got == label)
      cf[label]["fp"] += hit
      cf[label]["tn"] += !hit }}
# report(): print a header, then one line of counts and percentages per class.
# Data lines are piped through "sort -n"; the header is printed directly.
function report(    kl,tn,fn,fp,tp,sorter) {
  sorter = "sort -n"
  print " N TN FN FP TP PD PREC PF ACC LABEL"
  for (kl in cf) {
    tn = cf[kl]["tn"]
    fn = cf[kl]["fn"]
    fp = cf[kl]["fp"]
    tp = cf[kl]["tp"]
    printf "%5d %5d %5d %5d %5d %5d %5d %5d %5d %-20s\n",
           tp+fn, tn, fn, fp, tp,
           div(tp, tp+fn), div(tp, fp+tp), div(fp, fp+tn),
           div(tp+tn, tn+fn+fp+tp), kl | sorter }}
# div(a,b): a/b as a truncated whole-number percentage.
# The 1E-32 added to b keeps a zero denominator from raising a division error.
function div(a,b,    pct) {
  pct = 100 * a / (b + 1E-32)
  return int(pct) }

abcd: show classification metrics

Copyright (c) 2025 Tim Menzies, MIT License
https://opensource.org/licenses/MIT

Usage

chmod +x abcd && export PATH=$PATH:$(pwd)  # install
someprogram data.txt | abcd                 # use

Input Format

actual predicted
actual predicted
etc

Example

Uses diabetes.csv

make eg-abcd

--------- eg-abcd
    N    TN    FN    FP    TP    PD  PREC    PF   ACC LABEL
  255   383    83   110   172    67    60    22    74 tested_positive
  493   172   110    83   383    77    82    32    74 tested_negative

How It Works

  1. Processes each line: Updates TP, TN, FP, FN counts for all classes
  2. Calculates metrics: N (total instances), TN/FN/FP/TP (confusion matrix), PD (detection rate), PREC (precision), PF (false alarm rate), ACC (accuracy) - all as percentages
#!/usr/bin/env gawk -f
# Copyright (c) 2025 Tim Menzies, MIT License
# https://opensource.org/licenses/MIT
# Comma-separated input; blanks, tabs and carriage returns are stripped from every line.
BEGIN { FS = "," }
      { gsub(/[ \t\r]/, "") }
# Header: note the goal columns, then echo the header with an extra "Y-" column.
NR==1 { head(); print $0 ",Y-"; next }
# Every later line is a data row.
      { body() }
END   { tail() }
# head(): for each header name ending in "+" or "-", initialise its lo/hi trackers
# and record the target in y[i] (1 for "+", 0 for "-").
function head(    i) {
  for (i = 1; i <= NF; i++)
    if ($i ~ /[\+\-]$/) {
      lo[i] =  1e32
      hi[i] = -1e32
      y[i]  = ($i ~ /\+$/) }}
# body(): store the current record as row[NR-1]; for goal columns, coerce the
# field to a number and widen that column's lo/hi range.
function body(    i,v) {
  for (i = 1; i <= NF; i++) {
    if (i in y) {
      $i += 0
      hi[i] = ($i > hi[i]) ? $i : hi[i]
      lo[i] = ($i < lo[i]) ? $i : lo[i] }
    row[NR-1][i] = $i }}
# norm(i,x): rescale x by column i's observed range, lo[i] -> 0 and hi[i] -> 1.
# The 1e-32 guards against a zero-width range.
function norm(i,x,    span) {
  span = hi[i] - lo[i] + 1e-32
  return (x - lo[i]) / span }
# dist(row): root-mean-square gap between the row's normalized goal values and
# their targets y[i]. Returns 0 when there are no goal columns.
#   row : array of field values, indexed by column number
# Fix: the loop variable i is now a local; previously it leaked as a global.
function dist(row,    d,n,i) {
  for (i in y) { d += (norm(i,row[i]) - y[i])^2; n++ }
  return (d / (n+1e-32))^.5 }
# tail(): print every stored row followed by its dist() score, piped through a
# numeric sort on that appended last column.
function tail(    r,i,s,sep,com) {
  for (r in row) {
    # the sort command is built once, from the width of the first row visited
    if (!com) com = "sort -t, -nk" (1 + length(row[r]))
    s = ""
    for (i = 1; i <= NF; i++)
      s = s (i > 1 ? "," : "") row[r][i]
    print s "," dist(row[r]) | com }}
#!/usr/bin/env gawk -f
# Copyright (c) 2025 Tim Menzies, MIT License
# https://opensource.org/licenses/MIT
# Comma-separated input. BINS is the number of bins; CONVFMT controls how
# non-integer numbers are turned into strings when rows are rebuilt.
BEGIN { FS = ","; BINS = 7; CONVFMT = "%.2f" }
      { gsub(/[ \t\r]/, "") }
# Header: find the numeric columns, then echo the header.
NR==1 { head(); print; next }
      { body() }
END   { tail() }
# head(): a column is numeric when its name starts with an uppercase letter and
# does not end in "-", "+" or "!". For each such column, initialise lo/hi and
# set every bin minimum (0..BINS) to a huge value.
function head(    i,b) {
  for (i = 1; i <= NF; i++) {
    if ($i !~ /^[A-Z]/) continue
    if ($i ~  /[-+!]$/) continue
    lo[i] =  1E32
    hi[i] = -1E32
    for (b = 0; b <= BINS; b++) bmin[i][b] = 1E32 }}
# body(): keep the current record as row[NR-1], passing each field through seen()
# so numeric columns update their running statistics.
function body(    i) {
  i = 0
  while (++i <= NF)
    row[NR-1][i] = seen(i, $i) }
# tail(): two passes over the stored rows.
function tail(    r,i,s,sep) {
  # pass 1: let every value lower the minimum of the bin it falls into
  for (r in row)
    for (i = 1; i <= NF; i++) bin(i, row[r][i])
  # pass 2: print each row with numeric values replaced by their bin's minimum
  for (r in row) {
    s = ""
    for (i = 1; i <= NF; i++)
      s = s (i == 1 ? "" : ",") bin(i, row[r][i])
    print s }}
# seen(i,v): update column i's running count, mean, m2 and standard deviation
# with v, then return v (as a number). "?" values and non-numeric columns are
# returned untouched.
function seen(i,v,    d) {
  if (v == "?" || !(i in hi)) return v
  v += 0
  cnt[i]++
  d      = v - mu[i]
  mu[i] += d / cnt[i]
  m2[i] += d * (v - mu[i])
  sd[i]  = (cnt[i] >= 2) ? sqrt(m2[i] / (cnt[i] - 1)) : 0
  return v }
# bin(i,v): map v to a bin index using a logistic curve centred on mu[i] and
# scaled by sd[i], lower that bin's minimum if v is smaller, and return the
# bin's minimum. "?" values and non-numeric columns are returned untouched.
function bin(i,v,    b,z) {
  if (v == "?" || !(i in hi)) return v
  z = -1.704 * (v - mu[i]) / (sd[i] + 1E-32)
  b = int(BINS / (1 + exp(z)))
  if (v < bmin[i][b]) bmin[i][b] = v
  return bmin[i][b] }
#!/usr/bin/env lua
local help = [[
bins.lua : stochastic incremental XAI
(c) 2025, Tim Menzies, [email protected], mit-license.org
Options:
-h Show help.
-b bins=7 Number of bins for discretization.
-e era=30 Update model every `era` number of rows.
-r ruleMax=3 Max conditions in a rule.
-s seed=42 Random number seed.
-f file=../lua6/auto93.csv ]]
--- Turn a string into a number when it parses as one; otherwise return the
-- string with leading and trailing whitespace removed.
-- Returns nothing when `s` is nil or false.
local function coerce(s)
  if not s then return end
  local n = tonumber(s)
  if n then return n end
  return (s:match('^%s*(.-)%s*$')) end
local the={}; for k,v in help:gmatch("(%S+)=(%S+)") do the[k] = coerce(v) end
math.randomseed(the.seed)
local DATA, NUM, SYM, COLS, clone, adds
local abs,exp,sqrt,log = math.abs, math.exp, math.sqrt, math.log
local max,rand,cos = math.max, math.random, math.cos
local fmt = string.format
--- Sort `t` in place (optionally ordered by `f`) and return it, so calls can be nested.
local function sort(t,f)
  table.sort(t,f)
  return t end
--- Build a "less than" comparator that orders items by the key `f(item)`.
local function lt(f)
  return function(a,b) return f(a) < f(b) end end
--- Join the items of array `a` with single spaces and wrap the result in braces.
local function cat(a)
  return "{" .. table.concat(a," ") .. "}" end
--- Render any value as a string.
-- Whole numbers print without decimals, other numbers with three; tables with a
-- non-empty array part print their items in order; other tables print sorted
-- ":key value" pairs. Nested tables are rendered recursively.
-- `list` and `dict` are unused placeholders kept so the signature is unchanged.
local function o(v,     list,dict)
  if type(v) == "number" then
    return fmt(v%1==0 and "%.0f" or "%.3f", v) end
  if type(v) ~= "table" then return tostring(v) end
  local out = {}
  if #v > 0 then
    for _,item in ipairs(v) do out[1+#out] = o(item) end
    return cat(out) end
  for key,item in pairs(v) do out[1+#out] = fmt(":%s %s", key, o(item)) end
  return cat(sort(out)) end
--- Split a comma-separated line into an array of coerced cells.
-- Empty cells (two commas in a row) produce no entry.
local function s2a(s,     a)
  a = {}
  for cell in s:gmatch("([^,]+)") do
    a[#a + 1] = coerce(cell) end
  return a end
--- Iterate over the lines of a CSV file, yielding each as an array of cells.
-- Raises if the file cannot be opened; closes the file once the last line is read.
local function csv(file,     src)
  src = assert(io.open(file))
  return function()
    local line = src:read()
    if not line then src:close(); return end
    return s2a(line) end end
--- Split array `a0` into its first `n` items and the remainder.
-- When `data` is given, each half is returned as a clone of `data` holding
-- those items; otherwise two plain arrays are returned.
local function cut(a0,n,  data)
  local head, tail = {}, {}
  for j,v in ipairs(a0) do
    local dst = (j <= n) and head or tail
    dst[#dst + 1] = v end
  if data then return clone(data,head), clone(data,tail) end
  return head, tail end
--- Draw one sample from a normal distribution using the Box-Muller transform.
-- @param mu  mean of the distribution
-- @param sd  standard deviation of the distribution
-- @return    a random number
-- Fixes: the first uniform draw is taken as 1 - math.random(), which lies in
-- (0,1], so math.log never receives 0; and the angle uses 2*math.pi rather
-- than the rounded constant 6.28.
function box_muller(mu,sd)
  local u1, u2 = 1 - math.random(), math.random()
  return mu + sd * math.sqrt(-2 * math.log(u1)) * math.cos(2 * math.pi * u2) end
-- ----------------------------------------------------------------------------
--- Create an empty DATA (row count, rows, column summaries) and fill it from `src`.
function DATA(  src)
  local fresh = {n=0, rows={}, cols=nil}
  return adds(src, fresh) end
--- Create a new DATA with the same column names as `i`, then fill it from `src`.
function clone(i,  src)
  local empty = DATA({i.cols.names})
  return adds(src, empty) end
--- Create a summary of a numeric column.
-- @param at  column position (defaults to 0)
-- @param s   column name; `best` is 1 when the name ends in "+", else 0
function NUM(at,s)
  local goal = 0
  if tostring(s):find"+$" then goal = 1 end
  return {at=at or 0, of=s, n=0, mu=0, m2=0, sd=0, bins={}, best=goal} end
--- Create a summary of a symbolic column.
-- @param at  column position
-- @param s   column name
function SYM(at,s)
  local sym = {at=at, of=s, n=0}
  sym.has, sym.bins = {}, {}
  return sym end
--- Build column summaries from a row of column names.
-- Names starting with an uppercase letter become NUMs, the rest SYMs.
-- Names ending in "X" are kept in `all` only; names ending in "+" or "-" go
-- to `y`; everything else goes to `x`.
-- @return {all=..., x=..., y=..., names=row}
function COLS(row,     t,x,y,all,col)
  x,y,all = {},{},{}
  for n,s in ipairs(row) do
    local make = s:match"^[A-Z]" and NUM or SYM
    col = make(n,s)
    all[n] = col
    if not s:match"X$" then
      if s:find"[+-]$" then y[#y + 1] = col else x[#x + 1] = col end end end
  return {all=all, x=x, y=y, names=row} end
--- Update summary `i` with value `v` and return `v`.
-- `i` may be a SYM (has `has`), a NUM (has `mu`) or a DATA (has `rows`):
--   SYM  : bump the count for symbol `v`
--   NUM  : update the running mean, m2 and standard deviation
--   DATA : the first row received becomes the column header; every later row
--          updates each column summary and is appended to `rows`
-- "?" marks a missing value and is ignored.
-- Fix: a DATA's `n` no longer counts the header row, so it equals #rows.
local function add(i,v)
  if v == "?" then return v end
  if i.rows and not i.cols then   -- header row: build columns, count nothing
    i.cols = COLS(v)
    return v end
  i.n = i.n + 1
  if i.has then i.has[v] = 1 + (i.has[v] or 0)
  elseif i.mu then
    local d = v - i.mu
    i.mu = i.mu + d / i.n
    i.m2 = i.m2 + d * (v - i.mu)
    i.sd = i.n < 2 and 0 or math.sqrt(i.m2 / (i.n - 1))
  elseif i.rows then
    for _,col in pairs(i.cols.all) do add(col, v[col.at]) end
    i.rows[1 + #i.rows] = v end
  return v end
--- Add every item from `src` into `it` (a fresh NUM when `it` is omitted).
-- A string `src` is read as a CSV file name; a table is traversed with pairs;
-- nil adds nothing.
-- @return it
function adds(src, it)
  it = it or NUM()
  if type(src) == "string" then
    for row in csv(src) do add(it,row) end
    return it end
  for _,row in pairs(src or {}) do add(it,row) end
  return it end
--- Map a numeric value to 0..1 with a logistic curve centred on `i.mu` and
-- scaled by `i.sd`. Symbolic columns and "?" values are returned unchanged.
local function norm(i,v)
  if (i.has or v == "?") and v then return v end
  local z = -1.7 * (v - i.mu)/(i.sd + 1e-32)
  return 1/(1 + math.exp(z)) end
--- Distance from a row's goal values to their best values: the root mean
-- square of (normalized value - best) over the columns in `i.cols.y`.
local function disty(i,row,     d)
  d = 0
  for _,col in pairs(i.cols.y) do
    local gap = norm(col, row[col.at]) - col.best
    d = d + gap^2 end
  return sqrt(d / #i.cols.y) end
--- Sort rows in place by ascending disty() and return them.
-- Sorts `rows` when given, otherwise `i.rows`.
local function distys(i,  rows,     y)
  rows = rows or i.rows
  y = function(row) return disty(i,row) end
  table.sort(rows, function(r1,r2) return y(r1) < y(r2) end)
  return rows end
-- two(data): split the rows into train and test halves, seed a `seen` model
-- with the first four training rows, split those into `best` and `rest`, then
-- walk the remaining training rows, adding a row to `best` (and `seen`) when
-- it is closer to best's middle than to rest's.
-- Returns a table with best, rest, seen, test and a `model` function whose
-- result is d(row,best) - d(row,rest).
-- NOTE(review): shuffle, distx, mid and sub are not defined in the visible
-- part of this file, so calling two() as written would fail on a nil value.
-- Confirm where they are meant to come from.
local function two(data)
local train,test,start,todo,seen,best,rest,d
shuffle(data.rows)
train,test = cut(data.rows, data.n//2)
start,todo = cut(train, 4)
seen = clone(data, start)
best,rest = cut(distys(seen),2,data)
-- d: distance from a row to the middle of a row set
d = function(row,what) return distx(seen, row, mid(what)) end
for n,row in pairs(todo) do
-- stop once key n exceeds 256
if n>256 then break end
if d(row,best) < d(row,rest) then
add(seen, add(best, row))
-- keep best small: once it outgrows sqrt(seen.n), move its last-sorted row to rest
if best.n > sqrt(seen.n) then
add(rest, sub(best, table.remove( distys(best)))) end end end
distys(best)
return {best=best, rest=rest, seen=seen, test=test,
model=function(row) return d(row,best) - d(row, rest) end} end
-- ----------------------------------------------------------------------------
-- Command-line actions, keyed by flag. cli() calls each one with the coerced
-- value of the argument that follows the flag.
local egs={}
-- print the help text
egs["-h"] = function(_) print("\n"..help.."\n") end
-- reseed the random number generator
egs["-s"] = function(n) math.randomseed(n); the.seed =n end
-- show the current settings
egs["--the"] = function(_) print(o(the)) end
-- print every row of the.file
egs["--csv"] = function(_) for row in csv(the.file) do print(o(row)) end end
-- summarise 1000 normal samples; prints the observed mean and sd
egs["--num"] = function(_,num)
  num=NUM()
  for _=1,1000 do add(num, box_muller(10,5)) end
  print(fmt("%.3f %.3f", num.mu, num.sd)) end
-- print the x column summaries of the.file
egs["--data"] = function(_)
  for n,col in pairs(DATA(the.file).cols.x) do
    print(n,o(col)) end end
-- sort the rows of the.file by disty and print each row's score
-- (fix: `t` is now a local; it used to be assigned as a global)
egs["--disty"]= function(_,    data,t)
  data,t = DATA(the.file), {}
  distys(data)
  for n,row in pairs(data.rows) do t[n]=disty(data,row) end
  print(o(t)) end
--- Process command-line arguments in the order they were given.
-- @param d     settings table; a flag "-x" updates every key starting with "x"
-- @param funs  actions keyed by flag; a matching flag calls its action
-- In both cases the value used is the coerced argument that follows the flag.
-- Fix: iterate with ipairs, so flags are handled left to right and the script
-- name and interpreter entries of `arg` (index 0 and below) are skipped.
-- pairs() gave no ordering guarantee.
local function cli(d,funs)
  for i,s in ipairs(arg) do
    local nxt = coerce(arg[i+1])
    if funs[s] then funs[s](nxt)
    else
      for k,_ in pairs(d) do
        if k:sub(1,1)==s:sub(2) then d[k]=nxt end end end end end
if arg[0]:find"bins.lua" then cli(the,egs) end

bins: discretize numeric columns

Copyright (c) 2025 Tim Menzies, MIT License
https://opensource.org/licenses/MIT

Usage

chmod +x bins && export PATH=$PATH:$(pwd)  # install
bins < data.csv > discretized.csv          # use

Input Format

CSV file where:

  • First row: headers (columns starting with uppercase letter are numeric, unless ending with -, +, or !)
  • Remaining rows: data (use ? for missing values)

Example

bins < auto.csv
# Numeric columns divided into 7 bins (0-6)
# Each value replaced by its bin's minimum value

How It Works

  1. Head pass: Identifies numeric columns (uppercase headers without -+! suffix)
  2. Body pass: Stores all rows, calculates running mean and standard deviation for numeric columns
  3. Tail pass:
    • Bins each numeric value using logistic function: bin = int(BINS / (1 + exp(-1.704 * (v - mean) / stddev)))
    • Replaces values with their bin's minimum value
    • Outputs discretized CSV

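Note: Uses Welford's algorithm for numerically stable mean/variance calculation. The constant 1.704 scales the logistic curve so that it closely tracks the normal cumulative distribution function; values near the mean therefore fall into the middle bins.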

#!/usr/bin/env bash
# Copyright (c) 2025 Tim Menzies, MIT License
# https://opensource.org/licenses/MIT
# Here = directory holding this script. It is set first because the reload
# alias and the PATH line below expand $Here at the moment they are defined;
# it used to be assigned after them, leaving both with an empty value.
Here="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
alias ls="\ls --color"
alias reload="source '$Here/ell' && echo ✅"
alias grep='grep --color=auto'
alias tree='tree -C'
export BASH_SILENCE_DEPRECATION_WARNING=1
# prepend $Here only once, so repeated reloads do not keep growing PATH
case ":$PATH:" in
  *":$Here:"*) ;;
  *) export PATH="$Here:$PATH" ;;
esac
export HISTSIZE=10000
export HISTFILESIZE=20000
export HISTCONTROL=ignoredups:erasedups
# branch: print the name of the current git branch (nothing outside a repository).
branch() {
  git branch 2>/dev/null | awk '/^\*/ {print $2}'
}
# dirty: print "*" when git reports uncommitted changes; otherwise fail quietly.
dirty() {
  if [[ -n $(git status -s 2>/dev/null) ]]; then
    echo "*"
  else
    return 1
  fi
}
# terminal attributes used by the prompt and the banner
bold=$(tput bold)
col0=$(tput sgr0)
col1=$(tput setaf 6)
col2=$(tput setaf 3)
# prompt: parent-dir/current-dir, then the git branch and dirty marker
PROMPT_COMMAND='PS1="${bold}${col1}$(basename "$(dirname "$PWD")")/$(basename "$PWD")${col0} ${col2}$(branch)$(dirty)${col0} ▶ "'
# vi: launch nvim with --clean and a fixed list of --cmd settings (netrw layout,
# line numbers, status line, indentation, undo file, a Q mapping to quit all,
# colour scheme), passing all arguments straight through.
vi() {
nvim --clean \
--cmd "let g:netrw_banner=0 | let g:netrw_liststyle=3 | let g:netrw_browse_split=4 | let g:netrw_winsize=15" \
--cmd "set number relativenumber cursorline mouse=a clipboard=unnamedplus ignorecase smartcase" \
--cmd "set statusline=%#StatusLine#\ ▶\ %f\ %m%r%=%y\ ❖\ %l:%c\ ❖\ %p%%\ " \
--cmd "set expandtab tabstop=2 shiftwidth=2 splitright splitbelow" \
--cmd "set undofile undodir=~/.vim/undo" \
--cmd "nnoremap Q :quitall<CR>" \
--cmd "colorscheme zaibatsu" \
--cmd "set laststatus=2" \
"$@"
}
# hi: clear the screen and print the banner in colour col1, then reset to col0.
hi() {
clear
echo "${col1}"
# quoted 'EOF' keeps the banner text literal (no expansion inside the heredoc)
cat<<'EOF'
██╗ ██╗ ████████╗ ██████╗
██║ ██║ ╚══██╔══╝ ██╔══██╗
██║ ██║ ██║ ██████╔╝
██║ ██║ ██║ ██╔══██╗
███████╗ ██║ ██║ ██║ ██║
╚══════╝ ╚═╝ ╚═╝ ╚═╝ ╚═╝
(cause simple ain't stupid)
EOF
echo "${col0}"
}
# inst TOOL...: install every named tool that is not already on PATH, using
# brew (macOS), apt (Linux) or winget (MINGW).
# Fix: loop over all arguments ("$@"); the old `for p in $1` looked only at
# the first one. `p` is now local, and the function returns 0 when nothing is
# missing.
inst() {
  local p m=""
  for p in "$@"; do command -v "$p" &>/dev/null || m+="$p "; done
  [ -z "$m" ] && return 0
  # $m is left unquoted on purpose so each missing tool becomes its own word
  case "$(uname -s)" in
    Darwin*) brew install $m ;;
    Linux*)  sudo apt install -y $m ;;
    MINGW*)  winget install $m ;;
  esac
}
# only run slow or verbose command at initial startup, not on reload
# The test is true when this file is executed directly and false when it is
# sourced (as the reload alias and --init-file do).
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
inst git nvim gawk tree
hi
# replace this process with an interactive bash that sources this same file
exec bash --init-file "${BASH_SOURCE[0]}" -i
fi
#!/usr/bin/env gawk -f
# Copyright (c) 2025 Tim Menzies, MIT License
# https://opensource.org/licenses/MIT
# SEED seeds the random number generator; ERA is how many rows are buffered
# before they are written out.
BEGIN { SEED = 1234567891; ERA = 100 }
# Header: seed the generator and pass the header through after a blank line.
NR==1 { srand(SEED); print "\n" $0; next }
# Data: store the row under a random key; flush once more than ERA rows are held.
      { a[rand()] = $0
        if (length(a) > ERA) dump(a) }
END   { dump(a) }
# dump(a): if a holds anything, print a blank line, then every stored row in
# array traversal order, then empty the array.
function dump(a,    i) {
  if (length(a) == 0) return
  print ""
  for (i in a) print a[i]
  delete a }
#!/usr/bin/env sh
# Render the arguments with figlet's "mini" font and prefix each output line
# with "-- ". "$@" is quoted so arguments are not re-split or glob-expanded by
# the shell.
figlet -W -f mini "$@" | gawk '{print "-- " $0}'
# Lua.ssh --- Sheet definitions for Lua source code
# Copyright (c) 2014 Kenji Rikitake
# Copyright (c) 1999 Edward Arthur, Akim Demaille, Miguel Santana
#
#
# This file is NOT a part of a2ps.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2, or (at your option)
# any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; see the file COPYING. If not, write to
# the Free Software Foundation, 59 Temple Place - Suite 330,
# Boston, MA 02111-1307, USA.
## This style is derived from Edward Arthur's AWK Style Sheet
# Style sheet "Lua": names the style and lists the words and sequences that
# are assigned to each a2ps face.
style Lua is
written by "Kenji Rikitake <[email protected]>"
version is 0.1
requires a2ps version 4.9.7
documentation is
"This style file is intended to support the Lua programming language source code."
end documentation
# characters that may appear in an identifier
alphabets are
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz_0"
case sensitive
# operators, constants and basic library functions
keywords in Keyword are
and, false, ipairs, nil, not, or, pairs, rawequal, rawget,
rawlen, rawset, select, tonumber, tostring, true, type
end keywords
#keywords in Keyword_strong are
# control-flow words and the remaining listed library functions;
# "end" and "in" are quoted in the list below
keywords in Keyword_strong are
assert, break, collectgarbage, do, dofile, else, elseif, "end",
error, for, function, getmetatable, goto, if, "in", local,
load, loadfile, next, pcall, print, repeat, require, return,
setmetatable, then, until, while, xpcall, _G, _VERSION
end keywords
keywords in Label_strong are
"^" function
end keywords
# "self" is given the Comment face
keywords in Comment are
self
end keywords
# text between [[ and ]], and text after --, is given the Comment face
sequences are
"[[" Comment "]]",
"--" Comment,
C-string
end sequences
end style
# vim: ts=2 sw=2 noet
# Copyright (c) 2025 Tim Menzies, MIT License
# https://opensource.org/licenses/MIT
#------------------------------------------------------
K ?= 1#
M ?= 2#
BINS ?= 4#
ERA ?= 100#
DATA = ~/gits/moot/classify/diabetes.csv#
DATAG = ~/gits/moot/optimize/misc/auto93.csv#
G = gawk -f#
OK = gawk -f ok.awk -f#
#------------------------------------------------------
# Recipes run under bash, with this directory first on PATH so the local
# scripts are found; recipe lines are not echoed.
SHELL=/bin/bash
export PATH := $(CURDIR):$(PATH)
.SILENT:
# Every target that does not create a file of its own name is listed here
# (previously test, eg-bins, eg-best, eg-tree and sh were missing).
.PHONY: help egs test ok eg-bins eg-nbc eg-abcd eg-soybean eg-best eg-tree eg-globals sh pull push
# help: scan the makefile(s) for "target: ... ## description" lines and print
# each target name with its description, using ANSI colour codes.
help: ## show this help
@gawk 'BEGIN { FS=":.*?## ";c="\033[1;3"; r="\033[0m"; \
printf "\n%s6mmake%s [%s3moptions%s]:\n\n",c,r,c,r} \
NF==2 && $$1~/^[a-z0-9A-Z_-]+/{ \
printf " %s2m%-15s%s %s\n",c,$$1,r,$$2}' $(MAKEFILE_LIST)
#------------------------------------------------------
# hi: print a banner naming the target being run
hi=@echo -e "\n--------- $@"
test: eg-nbc eg-abcd eg-soybean eg-globals ## run all egs: now
ok:; chmod +x nbc abcd bins
eg-bins: ok $(DATA) ## discretize the numeric columns of DATA
	cat $(DATA) | bins BINS=$(BINS)
eg-nbc: ok $(DATA) ## run naive bayes classifier
	$(hi); cat $(DATA) | bins BINS=$(BINS) | nbc K=$K M=$M | sort | uniq -c
# abcd is now part of the pipeline, so the confusion matrix promised by the
# description is actually printed
eg-abcd: ok $(DATA) ## run classifier with confusion matrix
	$(hi); cat $(DATA) | bins BINS=$(BINS) \
	| nbc K=$K M=$M | abcd
# NOTE(review): this path is under ~/gits/timm/moot while DATA and DATAG use
# ~/gits/moot -- confirm which location is correct
eg-soybean: ## run classifier on soybean
	$(hi); $(MAKE) eg-abcd DATA=~/gits/timm/moot/classify/soybean.csv
# eg-best and eg-tree read DATAG, so they now depend on it rather than DATA
eg-best: ok $(DATAG) ## bin DATAG, pipe through best, show first and last rows
	cat $(DATAG) | bins BINS=$(BINS) | best | column -s, -t | sed -n '1,5p;395,$$p'
eg-tree: ok $(DATAG) ## bin DATAG, pipe through best and tree
	cat $(DATAG) | bins BINS=$(BINS) | best | tree
eg-globals: ok $(DATA) ## run the pipeline under ok.awk to report global variables
	$(hi); ( cat $(DATA) | $(OK) bins BINS=$(BINS) \
	| $(OK) nbc K=$K M=$M | $(OK) abcd ) > /dev/null
# misc ------------------------------------------------
sh: ## run a customized shell
	sh ell
pull: ## update from main
	git pull
push: ## commit to main
	git commit -am saving; git push; git status
# Pattern rule: build ~/tmp/NAME.pdf from NAME.lua by pretty-printing it with
# a2ps (lua.ssh style sheet, landscape, 3 columns, line numbers) and converting
# the PostScript with ps2pdf; the result is then opened with `open`.
~/tmp/%.pdf: %.lua
echo "pdf-ing $@ ... "
a2ps \
--file-align=virtual \
--line-numbers=1 \
--pro=color \
--lines-per-page=120 \
--pretty=lua.ssh \
--left-title="" \
--borders=no \
--right-footer="page %s. of %s#" \
--landscape \
--columns 3 \
-M letter \
-o - $^ | ps2pdf - $@
open $@
#!/usr/bin/env gawk -f
# Copyright (c) 2025 Tim Menzies, MIT License
# https://opensource.org/licenses/MIT
# Comma-separated input. WAIT = rows to train on before predicting;
# M and K are the smoothing constants used by predict().
BEGIN { FS=","; WAIT=20; M=2; K=1 }
      { gsub(/[ \t\r]/,"") }
# Header: locate the class column.
NR==1 { head(); next }
# Data: after the warm-up, print "actual predicted"; then train on the row.
      { if (NR > WAIT+1) print $klass, predict()
        train($klass) }
# head(): set klass to the position of the last header name ending in "!".
function head(    i) {
  for (i = NF; i >= 1; i--)
    if ($i ~ /[!]$/) { klass = i; break }}
# train(actual): count one more row of class `actual`, and one more sighting of
# each non-class field value under that class.
function train(actual,    i) {
  nk[actual]++
  for (i = 1; i <= NF; i++) {
    if (i == klass) continue
    freq[i][$i][actual]++ }}
# predict(): return the class with the highest log-likelihood for the current
# record, given the counts gathered by train().
#   prior      : (nk[k] + K) / (n + K * number_of_classes)
#   per column : (count of this value in class k + M/number_of_values) / (nk[k] + M)
# Fix: n is the number of rows trained on (the sum of nk). The old NR-1 also
# counted the header line.
function predict(    i,k,like,best,max,n) {
  max = -1E32
  for (k in nk) n += nk[k]
  for (k in nk) {
    like = log((nk[k] + K) / (n + K*length(nk)))
    for (i = 1; i <= NF; i++)
      if (i != klass)
        like += log((freq[i][$i][k] + M/length(freq[i])) / (nk[k] + M))
    if (like > max) {
      max  = like
      best = k }}
  return best ? best : k }

nbc: naive bayes classifier (for discrete data)

Copyright (c) 2025 Tim Menzies, MIT License
https://opensource.org/licenses/MIT

Usage

chmod +x nbc && export PATH=$PATH:$(pwd)  # install
bins < data.csv | nbc                     # use

Input Format

CSV file where:

  • First row: headers (column ending with ! is the class label)
  • Remaining rows: data (use ? for missing values)

Example

Uses diabetes.csv (binned)

make eg-nbc
--------- eg-nbc
 383 tested_negative tested_negative
 110 tested_negative tested_positive
  83 tested_positive tested_negative
 172 tested_positive tested_positive

How It Works

Note: Requires discretized data (pipe through bins first). Output can be piped to abcd for metrics.

  1. Warm-up: Trains on first 20 rows (WAIT=20) without predicting
  2. Incremental learning: For each subsequent row:
    • Predicts class using current model
    • Trains model on actual class
    • Outputs: actual_class predicted_class
  3. Classification: Selects class with max log-likelihood:
    • log P(class|features) = log P(class) + Σ log P(feature|class)
    • Uses m-estimate smoothing (M=2) for features and Laplace smoothing (K=1) for classes
# Copyright (c) 2025 Tim Menzies, MIT License
# https://opensource.org/licenses/MIT
# At end of input, report global variables that are not gawk built-ins.
END { rogues() }
# rogues(): walk SYMTAB and print, on stderr, a "?" followed by every name that
# does not match the list of known built-in variables. Prints nothing if all match.
function rogues(    i,s,known) {
  known = "^(NF|NR|FS|RS|RT|FNR|OFS|ORS|" \
          "PREC|ARGC|ARGV|OFMT|LINT|FPAT|" \
          "ERRNO|RSTART|ARGIND|" \
          "SUBSEP|CONVFMT|ENVIRON|SYMTAB|FUNCTAB|PROCINFO|" \
          "FILENAME|RLENGTH|BINMODE|" \
          "IGNORECASE|FIELDWIDTHS|ROUNDMODE|TEXTDOMAIN)$"
  for (i in SYMTAB) {
    if (i ~ known) continue
    s = s " " i }
  if (s) print "?" s > "/dev/stderr" }

54 Programming Heuristics Illustrated

XXX data model laiga. Presentation layer separate from business model

XXX too many vars==> too many ideas XXX vars passed as a group to a subfunction ==> hidden objects

XXX yo-yo is Hatton's point ("Does OO sync with how we think?")

XXX test-driven development: fig. 3 of https://ieeexplore.ieee.org/document/10352439 offers details on test

A comprehensive catalog of programming heuristics with examples from two.lua, Python, and larger systems.

These heuristics represent some basic ideas about what is "good" SE.

But be careful how you use them. If you only use this knowledge to complain about 'bad code,' you become the bottleneck to other people's work. But if you use it to quietly refactor and fix the problems you see, you can earn a reputation as a tech guru.


1. DRY (Don't Repeat Yourself)

In two.lua:

local help = [[
  -b  bins=7     Number of bins for discretization.
  -e  era=30     Update model every `era` number of rows.
  -r  ruleMax=3  Max conditions in a rule.
  -s  seed=42    Random number seed.]]

local the={}; for k,v in help:gmatch("(%S+)=(%S+)") do the[k] = coerce(v) end

Single source of truth - help string defines both documentation AND defaults.

Python example:

# Bad: Define settings twice
DEFAULT_TIMEOUT = 30
parser.add_argument('--timeout', default=30, help='Timeout in seconds (default: 30)')

# Good: Define once
DEFAULTS = {'timeout': 30, 'retries': 3}
for key, val in DEFAULTS.items():
    parser.add_argument(f'--{key}', default=val, help=f'{key} (default: {val})')

Bigger system example: Django's database models define the schema once, then auto-generate admin interfaces, forms, migrations, and API serializers from that single definition.


2. Rule of Representation (Fold knowledge into data)

In two.lua:

col = (s:match"^[A-Z]" and NUM or SYM)(n,s)
t = s:find"[+-]$" and y or x

Column names encode the schema - uppercase=numeric, +/-=goals, X=skip.

Python example:

# Encode validation rules in data
RULES = {
    'email': r'^[\w\.-]+@[\w\.-]+\.\w+$',
    'phone': r'^\d{3}-\d{3}-\d{4}$',
    'zip': r'^\d{5}$'
}

def validate(field, value):
    return re.match(RULES[field], value)

Bigger system example: Unix file permissions (rwxr-xr--) encode all access rules in 9 bits. The chmod program is simple because the data structure (permission bits) carries the knowledge.


3. Rule of Simplicity

In two.lua:

local function add(i,v,  inc)
  if v == "?" then return v end
  inc = inc or 1
  i.n = i.n + inc
  if i.mode then i.has[v] = inc + (i.has[v] or 0) 
  elseif i.mu then ...
  elseif i.rows then ...

One function handles NUM, SYM, and DATA with minimal branching.

Python example:

def median(numbers):
    sorted_nums = sorted(numbers)
    n = len(sorted_nums)
    mid = n // 2
    return sorted_nums[mid] if n % 2 else (sorted_nums[mid-1] + sorted_nums[mid]) / 2

Simple, direct algorithm. No complex edge case handling.

Bigger system example: Git's object model - everything is either a blob, tree, commit, or tag. Four types, composed simply, create an entire version control system.


4. Rule of Parsimony

In two.lua: Entire incremental XAI system in ~150 lines. Functions like:

local function cut(a0,n,  data)
  local a1,a2 = {},{}
  for j,v in ipairs(a0) do if j <= n then a1[1+#a1]=v else a2[1+#a2]=v end end
  if data then return clone(data,a1),clone(data,a2) end
  return a1,a2 end

Python example:

# Flask web server in 5 lines
from flask import Flask
app = Flask(__name__)

@app.route('/')
def hello(): return "Hello World!"

app.run()

Bigger system example: SQLite is ~150K lines for a full SQL database. Compare to Oracle's millions of lines. Parsimony wins for embedded use.


5. Rule of Clarity

In two.lua:

local lt = function(f) return function(a,b) return f(a) < f(b) end end
local cat = function(a) return "{".. table.concat(a," ") .."}" end

Names tell you exactly what they do: lt makes comparators, cat concatenates.

Python example:

def is_palindrome(text):
    cleaned = ''.join(c.lower() for c in text if c.isalnum())
    return cleaned == cleaned[::-1]

Name and implementation clearly express intent.

Bigger system example: Go's error handling - if err != nil { return err } is verbose but crystal clear. No hidden control flow like exceptions.


6. Rule of Economy

In two.lua:

local shuffle = function(t,    n)
  for m=#t,2,-1 do n=math.random(m); t[m],t[n]=t[n],t[m] end; return t end

Simple Fisher-Yates, not some optimized version. Programmer time > machine time.

Python example:

# Simple bubble sort for teaching
def bubble_sort(arr):
    for i in range(len(arr)):
        for j in range(len(arr)-1-i):
            if arr[j] > arr[j+1]:
                arr[j], arr[j+1] = arr[j+1], arr[j]

Not efficient, but easy to understand and maintain.

Bigger system example: Python's sorted() uses Timsort - it's complex internally, but the API is dead simple: sorted(items). Economy of interface, not implementation.


7. Rule of Generation

In two.lua:

local the={}; for k,v in help:gmatch("(%S+)=(%S+)") do the[k] = coerce(v) end

Generate config parser from help string automatically.

Python example:

# Generate test cases from data
test_cases = [(i, i**2) for i in range(10)]

for input_val, expected in test_cases:
    assert square(input_val) == expected

Bigger system example: Protocol Buffers - define data schema once, generate serializers/deserializers for 20+ languages automatically.


8. Rule of Least Surprise

In two.lua:

local function add(i,v,  inc) ...
local function sub(i,v) return add(i,v,-1) end

add adds, sub subtracts. Does what the name says.

Python example:

class Stack:
    def push(self, item): self.items.append(item)
    def pop(self): return self.items.pop()
    def peek(self): return self.items[-1]

Method names are intuitive verbs that do exactly what you expect.

Bigger system example: jQuery's $('.class').hide() - method names are verbs that do exactly what they say. No surprises.


9. Separation of Concerns

In two.lua:

-- File I/O
local function csv(file,    src)
  src = assert(io.open(file))
  return function(    s)
    s = src:read()
    if s then return s2a(s) else src:close() end end end

-- String parsing
local function s2a(s,   a)
  a={}; for s1 in s:gmatch"([^,]+)" do a[1+#a] = coerce(s1) end; return a end

-- Type conversion
local function coerce(s) 
  if s then return tonumber(s) or s:match'^%s*(.-)%s*$' end end

Each function handles one concern: I/O, parsing, or type conversion.

Python example:

# Separate data access, business logic, presentation
class UserRepository:
    def get_user(self, id): return db.query(User).get(id)

class UserService:
    def activate_user(self, user_id):
        user = self.repo.get_user(user_id)
        user.active = True
        return user

class UserView:
    def render(self, user): return f"<div>{user.name}</div>"

Bigger system example: MVC pattern - Models handle data, Views handle presentation, Controllers handle user input. Each concern separated.

Important sub-case.

  • don't pollute the presentation layer with model concerns
  • e.g. don't always print to the console

Should you replace print with say?

Yes. Replacing raw print statements is a best practice in software engineering. Relying on raw prints creates "noise" that is difficult to turn off when you move from development to production.

You generally have two paths to solve this: the Custom Wrapper (your "say" idea) or Standard Logging.

Option 1: The "Say" Wrapper (Quick & Simple)

This is exactly what you proposed. It is a lightweight solution perfect for scripts, CLI tools, and small prototypes.

How it works: You wrap the print function in a conditional check based on a global flag.

# Configuration
VERBOSE = True  # Toggle this single flag to silence the app

def say(message):
    if VERBOSE:
        print(f"[INFO] {message}")

# Usage
say("Fetching data...")     # Prints only if VERBOSE is True
print("Fatal Error: 404")   # Always prints (standard print for actual results)

Pros:

  • Zero Dependencies: No libraries to import or configure.
  • Total Control: You can easily add timestamps or colors to the say function later.
  • Binary Silence: It is either ON or OFF.

Option 2: The Logging Library (The Industry Standard)

If your project is an application, a server, or a library used by others, you should skip the "say" function and use a logging library (like Python's logging, Java's Log4j, or JS winston).

How it works: Instead of a binary on/off switch, you use Levels. This allows you to silence "chatter" while keeping "warnings" active.

import logging

# Configuration: Set to INFO to hide DEBUG messages
logging.basicConfig(level=logging.INFO) 
logger = logging.getLogger()

# Usage
logger.debug("Variable x = 5")        # Silenced (Level is too low)
logger.info("Process started.")       # Printed
logger.warning("Disk space low.")     # Printed
logger.error("Database crashed!")     # Printed

Pros:

  • Granularity: You can silence trivia without silencing warnings.
  • Destinations: You can send logs to a file, the console, and an email simultaneously.
  • Standardization: Other developers already know how to use it.

Comparison Summary

Feature Raw print() Custom say() Wrapper Standard logging
Control None High (Custom code) High (Configuration)
Silencing Manual deletion Single Flag (All or Nothing) Leveled (Debug vs Error)
Complexity Low Low Medium
Best For Throwaway code Scripts & Tools Production Apps

Recommendation

  1. Refactor immediately: Move away from raw print statements for any informational text.
  2. Start with say: If you just need a "mute button," your proposed solution is perfect.
  3. Upgrade to logging later: If the project grows and you need to differentiate between "This is just info" and "This is a warning," switch to a logging library.

10. Single Responsibility Principle

In two.lua:

local function shuffle(t,    n)  -- Only shuffles
local function cut(a0,n,  data)  -- Only splits
local function norm(i,v)         -- Only normalizes

Python example:

def read_file(path):
    with open(path) as f:
        return f.read()

def parse_json(text):
    return json.loads(text)

def validate_config(config):
    assert 'version' in config
    return config

# Each does one thing
config = validate_config(parse_json(read_file('config.json')))

Bigger system example: Unix utilities - grep only searches, sort only sorts, uniq only deduplicates. Compose them with pipes.


11. Open/Closed Principle

In two.lua:

local function add(i,v,  inc)
  if v == "?" then return v end
  inc = inc or 1
  i.n = i.n + inc
  if i.mode then i.has[v] = inc + (i.has[v] or 0) 
  elseif i.mu then ...
  elseif i.rows then ...

Open for extension (new types via new fields), closed for modification (don't change add).

Python example:

class Shape:
    def area(self): raise NotImplementedError

class Circle(Shape):
    def __init__(self, r): self.r = r
    def area(self): return 3.14 * self.r ** 2

class Square(Shape):
    def __init__(self, s): self.s = s
    def area(self): return self.s ** 2

# Can add new shapes without modifying existing code
shapes = [Circle(5), Square(4)]
total_area = sum(s.area() for s in shapes)

Bigger system example: Linux kernel modules - add new device drivers without modifying kernel core. Open for extension, closed for modification.


12. Composition Over Inheritance

In two.lua:

function DATA(  src) return adds(src, {n=0,rows={},cols=nil}) end
function COLS(row,    t,x,y,all,col)
  x,y,all = {},{},{}
  for n,s in ipairs(row) do
    col = (s:match"^[A-Z]" and NUM or SYM)(n,s)
    all[n] = col ...

DATA has COLS, COLS has array of NUM/SYM. Tables composed, not inherited.

Python example:

class Engine:
    def start(self): return "Engine running"

class Wheels:
    def roll(self): return "Rolling"

class Car:
    def __init__(self):
        self.engine = Engine()
        self.wheels = Wheels()
    
    def drive(self):
        return f"{self.engine.start()}, {self.wheels.roll()}"

Car composed of Engine and Wheels, not inheriting from them.

Bigger system example: React components - compose small components into larger ones. No deep inheritance hierarchies, just composition.


13. Uniform Access Principle

In two.lua:

local function mid(i)
  if i.mu then return i.mu 
  elseif i.has then return mode(i.has) 
  elseif i.rows then
    i._mid = i._mid or mode(i.has)
    return i._mid end end

Same interface mid(i) works for NUM, SYM, or DATA.

Python example:

class Temperature:
    def __init__(self, celsius):
        self._celsius = celsius
    
    @property
    def fahrenheit(self):
        return self._celsius * 9/5 + 32

t = Temperature(100)
print(t.fahrenheit)  # Accessed like attribute, computed like method

Bigger system example: Ruby - array.length and string.length use same interface, don't care if it's a method or property.


14. Command-Query Separation

In two.lua:

-- Commands (modify state, but also return for chaining)
local function add(i,v,  inc) ... end  

-- Queries (just read, never modify)
local function mid(i) ...              
local function distx(i,row1,row2) ...  

Python example:

class Stack:
    # Command - modifies state
    def push(self, item):
        self.items.append(item)
    
    # Query - just reads
    def size(self):
        return len(self.items)
    
    # Command that also returns (useful for chaining)
    def pop(self):
        return self.items.pop()

Bigger system example: SQL - SELECT queries don't modify data, INSERT/UPDATE/DELETE commands do. Clear separation.


15. Fail Fast

In two.lua:

src = assert(io.open(file))

Die immediately if file missing, don't propagate errors everywhere.

Python example:

def divide(a, b):
    assert b != 0, "Cannot divide by zero"
    return a / b

def process_user(user):
    assert user is not None, "User cannot be None"
    assert user.email, "User must have email"
    return send_email(user.email)

Bigger system example: Rust's unwrap() - panic immediately on None/Err rather than silently propagating null through the system.


16. Duck Typing / Polymorphism via Duck Typing

In two.lua:

local function add(i,v,  inc)
  ...
  if i.mode then i.has[v] = inc + (i.has[v] or 0) 
  elseif i.mu then ...

"If it has .mu, treat it as NUM; if it has .mode, treat it as SYM"

Python example:

def process(file_like):
    data = file_like.read()
    file_like.close()
    return data

# Works with real files, StringIO, network sockets, etc.
from io import StringIO
process(open('data.txt'))
process(StringIO("hello"))

Bigger system example: Python's file-like objects - anything with .read(), .write(), .close() can be treated as a file. StringIO, network sockets, HTTP responses all work the same.


17. Postel's Law (Robustness Principle)

In two.lua:

local function aha(col,v1,v2)
  if v1=="?" and v2=="?" then return 1 end
  if col.mode then return v1==v2 and 0 or 1 end
  v1 = v1 ~= "?" and v1 or (v2 > 0.5 and 0 or 1)
  v2 = v2 ~= "?" and v2 or (v1 > 0.5 and 0 or 1)
  return math.abs(v1 - v2) end

Liberal in what you accept - handles missing values gracefully.

Python example:

def safe_int(value, default=0):
    """Accept strings, floats, None - be liberal"""
    if value is None:
        return default
    try:
        return int(value)
    except (ValueError, TypeError):
        return default

safe_int("42")      # 42
safe_int("hello")   # 0
safe_int(None)      # 0
safe_int(3.14)      # 3

Bigger system example: HTML parsers - browsers accept malformed HTML and do their best to render it. Conservative in output (valid HTML), liberal in input (broken HTML).


18. Principle of Least Knowledge (Law of Demeter)

In two.lua:

return function(row)  return d(row,best) - d(row, rest) end

two() returns a function, not exposing internal best/rest structures.

Python example:

# Bad: reaches through multiple objects
customer.wallet.money.subtract(price)

# Good: ask, don't reach
customer.pay(price)

class Customer:
    def pay(self, amount):
        self.wallet.deduct(amount)

Bigger system example: jQuery - $('#id').find('.class').hide() - each method only knows about what it returns, not the whole DOM tree structure.


19. Make Illegal States Unrepresentable

In two.lua:

function NUM(at,s) 
  return {at=at or 0, of=s, n=0, mu=0, m2=0, sd=0,
          best=(tostring(s) or ""):find"+$" and 1 or 0} end

function SYM(at,s) return {at=at, of=s, n=0, has={}, mode=0, most=-1} end

NUM has mu/m2/sd, SYM has has/mode - can't accidentally mix them.

Python example:

from enum import Enum

class Status(Enum):
    PENDING = 1
    APPROVED = 2
    REJECTED = 3

# Can't accidentally set status to "maybe" or 42
order.status = Status.APPROVED  # OK
order.status = "approved"       # Type error

Bigger system example: Rust's type system - Option<T> means "might be None", forcing you to handle the case. Can't accidentally use null.


20. Worse is Better

In two.lua:

local function mode(d,   v,n)
  v,n = nil,0
  for v1,n1 in pairs(d) do if n1>n then v,n=v1,n1 end end
  return v end 

O(n) scan, not a maintained heap. Simple, works, good enough.

Python example:

# Simple O(n²) solution for small lists
def find_duplicates(items):
    dups = []
    for i, item in enumerate(items):
        if item in items[i+1:]:
            dups.append(item)
    return dups

For small lists, clarity beats optimization.

Bigger system example: C's malloc/free vs garbage collection. Simple, predictable, worse than GC in some ways but wins for systems programming.


21. Referential Transparency

In two.lua:

local function distx(i,row1,row2,     d)
  d=0; for _,x in pairs(i.cols.x) do d= d + aha(x, row1[x.at],row2[x.at])^2 end
  return sqrt(d/#i.cols.x) end

local function norm(i,v)
  return 1/(1 + math.exp(-1.7 * (v - i.mu)/(i.sd + 1e-32))) end

Pure functions - same inputs always give same outputs.

Python example:

# Referentially transparent
def add(a, b):
    return a + b

# Not referentially transparent
counter = 0
def increment():
    global counter
    counter += 1
    return counter

Bigger system example: Functional programming languages like Haskell - pure functions are the default, side effects are explicit and tracked by type system.


22. Lazy Evaluation / Generators

In two.lua:

local function csv(file,    src)
  src = assert(io.open(file))
  return function(    s)
    s = src:read()
    if s then return s2a(s) else src:close() end end end

Returns a function that reads one line at a time, not all at once.

Python example:

def fibonacci():
    a, b = 0, 1
    while True:
        yield a
        a, b = b, a + b

# Only computes what you use
for i, fib in enumerate(fibonacci()):
    if i > 10: break
    print(fib)

Bigger system example: Python's range() - range(1000000) doesn't create a million numbers in memory; it creates a lazy range object that computes each number on demand as you iterate.


23. Factory Pattern

In two.lua:

function COLS(row,    t,x,y,all,col)
  x,y,all = {},{},{}
  for n,s in ipairs(row) do
    col = (s:match"^[A-Z]" and NUM or SYM)(n,s)
    all[n] = col

The column name determines which constructor to call (NUM or SYM).

Python example:

def create_logger(log_type):
    if log_type == "file":
        return FileLogger()
    elif log_type == "console":
        return ConsoleLogger()
    elif log_type == "network":
        return NetworkLogger()

logger = create_logger("file")

Bigger system example: Django's database backends - connection.cursor() returns different cursor objects (PostgreSQL, MySQL, SQLite) based on configuration.


24. Builder Pattern (Fluent Interface)

In two.lua:

add(seen, add(best, row))

Chaining operations - add returns the added value so you can immediately add it somewhere else.

Python example:

class QueryBuilder:
    def __init__(self):
        self.query = ""
    
    def select(self, fields):
        self.query += f"SELECT {fields} "
        return self
    
    def from_table(self, table):
        self.query += f"FROM {table} "
        return self
    
    def where(self, condition):
        self.query += f"WHERE {condition}"
        return self

# Fluent chaining
query = QueryBuilder().select("*").from_table("users").where("age > 18")

Bigger system example: jQuery: $('#element').fadeIn().addClass('active').slideDown() - each method returns the object so you can chain.


25. Strategy Pattern

In two.lua:

local function distys(i,  rows,      y)
   y = function(row) return disty(i, row) end
   return sort(rows or i.rows, function(r1,r2) return y(r1) < y(r2) end) end

Pass in different comparison functions to get different sorting behaviors.

Python example:

class PaymentProcessor:
    def __init__(self, strategy):
        self.strategy = strategy
    
    def process(self, amount):
        return self.strategy.pay(amount)

class CreditCard:
    def pay(self, amount): return f"Paid ${amount} with credit card"

class PayPal:
    def pay(self, amount): return f"Paid ${amount} with PayPal"

processor = PaymentProcessor(CreditCard())
processor.process(100)

Bigger system example: JavaScript's Array.sort() - pass any comparison function: items.sort((a,b) => a.price - b.price).


26. Default Arguments / Sensible Defaults

In two.lua:

function NUM(at,s) 
  return {at=at or 0, of=s, n=0, mu=0, m2=0, sd=0,
          best=(tostring(s) or ""):find"+$" and 1 or 0} end

Uses or to provide defaults when arguments are nil.

Python example:

def greet(name="World", greeting="Hello"):
    return f"{greeting}, {name}!"

greet()                          # "Hello, World!"
greet("Tim")                     # "Hello, Tim!"
greet("Tim", "Hi")              # "Hi, Tim!"
greet(greeting="Hey")           # "Hey, World!"

Bigger system example: Python's requests.get() - requests.get('http://example.com') has sensible defaults for timeout, headers, SSL verification.


27. Null Object Pattern

In two.lua:

local function add(i,v,  inc)
  if v == "?" then return v end
  inc = inc or 1
  ...

"?" is treated as a special null/missing value. Code handles it gracefully.

Python example:

class NullLogger:
    def log(self, msg): pass
    def error(self, msg): pass

class RealLogger:
    def log(self, msg): print(f"LOG: {msg}")
    def error(self, msg): print(f"ERROR: {msg}")

# No if-checks needed
logger = NullLogger() if quiet_mode else RealLogger()
logger.log("Starting process")  # Works either way

Bigger system example: NumPy's NaN handling - operations on NaN propagate gracefully: np.mean([1, 2, np.nan, 4]) returns nan rather than crashing.


28. Immutable Core, Mutable Shell

In two.lua:

-- Immutable/pure calculations
local function distx(i,row1,row2,     d)
  d=0; for _,x in pairs(i.cols.x) do d= d + aha(x, row1[x.at],row2[x.at])^2 end
  return sqrt(d/#i.cols.x) end

-- Mutable updates
local function add(i,v,  inc)
  i.n = i.n + inc
  ...

Python example:

from dataclasses import dataclass

@dataclass(frozen=True)
class Point:
    x: int
    y: int
    
    def distance_to(self, other):
        return ((self.x - other.x)**2 + (self.y - other.y)**2)**0.5

# Core data is immutable, but can build new points
p1 = Point(0, 0)
p2 = Point(3, 4)
# p1.x = 5  # Error! Frozen

Bigger system example: React's virtual DOM - rendering functions are pure (props in → virtual DOM out), but the framework handles mutations to the real DOM separately.


29. Caching / Memoization

In two.lua:

local function mid(i)
  ...
  elseif i.rows then
    i._mid = i._mid or mode(i.has)
    return i._mid end end

Cache the computed _mid value. Compute once, reuse many times.

Python example:

from functools import lru_cache

@lru_cache(maxsize=128)
def fibonacci(n):
    if n < 2: return n
    return fibonacci(n-1) + fibonacci(n-2)

# Exponential → linear with one line
fibonacci(100)  # Fast!

Bigger system example: Redis - entire databases dedicated to caching. Web apps cache database queries, API responses, rendered HTML.


30. Small Functions (Extract Method)

In two.lua:

local function mode(d,   v,n)
  v,n = nil,0
  for v1,n1 in pairs(d) do if n1>n then v,n=v1,n1 end end
  return v end 

local function mid(i)
  if i.mu then return i.mu 
  elseif i.has then return mode(i.has) 
  ...

Python example:

def process_order(order):
    validate_order(order)
    charge_customer(order)
    send_confirmation(order)
    update_inventory(order)

def validate_order(order):
    assert order.items, "Order must have items"
    assert order.customer, "Order must have customer"

Bigger system example: Unix philosophy - cat log | grep ERROR | sort | uniq -c | sort -rn | head - small tools composed.


31. Guard Clauses (Early Return)

In two.lua:

local function add(i,v,  inc)
  if v == "?" then return v end
  inc = inc or 1
  i.n = i.n + inc
  if i.mode then i.has[v] = inc + (i.has[v] or 0) 
  elseif i.mu then ...

Python example:

def process_payment(amount, account):
    if amount <= 0:
        return "Invalid amount"
    
    if not account:
        return "No account"
    
    if account.balance < amount:
        return "Insufficient funds"
    
    # Happy path has minimal nesting
    account.balance -= amount
    return "Success"

Bigger system example: Express.js middleware - authentication checks return early:

function requireAuth(req, res, next) {
  if (!req.user) return res.status(401).send('Unauthorized');
  next();
}

32. Intentional Naming

In two.lua:

local function distx(i,row1,row2,     d)  -- distance in X space
local function disty(i,row,     d)        -- distance in Y space
local function distys(i,  rows,      y)   -- sort all by Y distance

Names tell you exactly what space you're operating in.

Python example:

# Bad
def calc(x, y):
    return x * y * 0.7

# Good
def calculate_discounted_price(original_price, quantity, discount_rate=0.7):
    return original_price * quantity * discount_rate

Bigger system example: Ruby on Rails - User.find(id) finds one, User.where(name: 'Tim') finds many, user.save persists.


33. Closure (Lexical Scope Capture)

In two.lua:

return function(row)  return d(row,best) - d(row, rest) end

The returned function "closes over" d, best, and rest from the enclosing scope.

Python example:

def make_multiplier(factor):
    def multiply(x):
        return x * factor  # Captures 'factor' from outer scope
    return multiply

times_three = make_multiplier(3)
times_five = make_multiplier(5)

print(times_three(10))  # 30
print(times_five(10))   # 50

Bigger system example: JavaScript event handlers - button.onclick = () => this.handleClick() captures this from the surrounding context.


34. Optional Chaining / Safe Navigation

In two.lua:

best=(tostring(s) or ""):find"+$" and 1 or 0

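Convert to a string before calling :find, with an empty-string fallback, so the chain cannot crash on a nil name. (Note: in Lua tostring(nil) returns the string "nil", so the or "" fallback is purely defensive.)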

Python example:

# Using dict.get() for safe navigation
user_name = user.get('profile', {}).get('name', 'Anonymous')

# Or with walrus operator
if (profile := user.get('profile')) and (name := profile.get('name')):
    print(name)

Bigger system example: JavaScript's optional chaining: user?.address?.street?.name - if any part is null or undefined, the entire expression evaluates to undefined instead of throwing a TypeError.


35. KISS (Keep It Simple, Stupid)

In two.lua:

local function cli(d,funs)
  for i,s in pairs(arg) do
    if funs[s]
    then funs[s](coerce(arg[i+1])) ...

The command line argument parser is a simple loop checking a dictionary. No complex flag parsing libraries used.

Python example:

# Simple config loading
import json
def load_config(path):
    with open(path) as f:
        return json.load(f)

Don't use a heavy configuration management library when a simple JSON load suffices.

Bigger system example: Redis - uses a simple text-based protocol (RESP) that is human-readable and easy to parse, rather than a complex binary protocol.


36. YAGNI (You Aren't Gonna Need It)

In two.lua:

local function two(data) 
  -- ... logic for clustering ...
  return function(row) return d(row,best) - d(row, rest) end end

The code calculates clusters but doesn't implement features to "save" the model to disk or "export" to JSON. It runs, outputs, and exits.

Python example:

class User:
    def __init__(self, name):
        self.name = name
        # YAGNI: Don't add address, phone, ssn until actually needed

Bigger system example: Extreme Programming (XP) - emphasizes implementing only the user stories scheduled for the current iteration, never building infrastructure for future hypothetical requirements.


37. Avoid Premature Optimization

In two.lua:

local function distx(i,row1,row2,     d)
  d=0; for _,x in pairs(i.cols.x) do d= d + aha(x, row1[x.at],row2[x.at])^2 end
  return sqrt(d/#i.cols.x) end

Calculates distance on-the-fly every time. It does not cache a distance matrix (which would consume $O(N^2)$ memory) because the dataset size in this context doesn't warrant it yet.

Python example:

# Write clear code first, optimize later
total = sum(item.price for item in cart)
# Don't switch to numpy arrays unless 'cart' has millions of items

Bigger system example: Donald Knuth's famous quote from his 1974 paper "Structured Programming with go to Statements": "Premature optimization is the root of all evil." He argued for getting the program correct first, and optimizing only the critical hotspots later.


38. Optimize for Reading, Not Writing

In two.lua:

local fmt = string.format
local function o(v, ...) -- complex stringification logic

The o function is complex to write, but it ensures that the output (and the code using it) is readable and clean.

Python example:

# Verbose to write, easy to read
if user.is_active and user.has_permission and not user.is_blocked:
    grant_access()

# Hard to read (Code golf)
if all([u.a, u.p, not u.b]): g()

Bigger system example: Python itself - strictly enforces indentation. It makes writing code slightly stricter, but guarantees that all code looks visually similar, optimizing for the reader.


39. Minimize Cognitive Load

In two.lua:

local help = [[
two.lua : stochastic incremental XAI
...
Options:
  -h             Show help.
  -b  bins=7     Number of bins for discretization. ]]

All configuration options are visible in one place at the top of the file. You don't have to hunt through 5 files to find the settings.

Python example:

# Facade pattern helps minimize load
# Instead of importing 10 classes, import one
from my_library import easy_api
easy_api.run()

Bigger system example: Go (Golang) - The language specification is small enough to hold in your head. It lacks features like generics (historically) or operator overloading to keep the mental model of the code simple.


40. Structured Programming

In two.lua:

local function distys(i,  rows,      y)
   y = function(row) return disty(i, row) end
   return sort(rows or i.rows, function(r1,r2) return y(r1) < y(r2) end) end

Uses clear block structures (functions, scoped variables) and higher-order functions instead of goto or spaghetti jumps.

Python example:

# Structured control flow
try:
    process_data()
except Error:
    handle_error()
finally:
    cleanup()

Bigger system example: Dijkstra's "Go To Statement Considered Harmful" - the foundation of modern languages (Java, C#, etc.) which enforce structured loops and blocks over arbitrary jumps.


41. No Magic Numbers

In two.lua:

local the={}; for k,v in help:gmatch("(%S+)=(%S+)") do the[k] = coerce(v) end
...
if n > 256 then break end  -- Wait, 256 is magic!

Critique: two.lua actually violates this rule with if n > 256. In its defense, it moves most other numbers into the help string (bins=7, era=30), where they are parsed into named constants in the settings table the (e.g., the.bins).

Python example:

# Bad
time.sleep(86400)

# Good
SECONDS_IN_DAY = 86400
time.sleep(SECONDS_IN_DAY)

Bigger system example: HTTP Status Codes - We use HTTP_200_OK or HTTP_404_NOT_FOUND in constants files rather than hardcoding 200 or 404 throughout the application logic.


42. Strict in Types, Loose in Values

In two.lua:

local function coerce(s) 
  if s then return tonumber(s) or s:match'^%s*(.-)%s*$' end end

The system is strict about needing a value (or a default), but loose in accepting a string and converting it to a number if it looks like one.

Python example:

def add_to_cart(item_id):
    # Accepts int or string "123", converts to int 123
    id = int(item_id) 
    ...

Bigger system example: REST APIs - Often accept "true", "True", or boolean true in JSON payloads for boolean fields to be accommodating to different clients.


43. Name Things Once

In two.lua:

local help = [[ ... -b  bins=7 ... ]]
-- The name "bins" is defined in the string, parsed into 'the.bins'.
-- We don't manually type 'the.bins = 7' separately.

The definitions in the help string drive the logic. The variable name exists in one place.

Python example:

from collections import namedtuple

# Define field names once
Point = namedtuple('Point', ['x', 'y'])
p = Point(10, 20)
# Use p.x, p.y - names are consistent

Bigger system example: Terraform - You define a resource name (e.g., aws_instance.web) once, and reference that symbolic name elsewhere in the infrastructure definition.


44. Choose Boring Technology

In two.lua:

-- No "require 'torch'" or "require 'lfs'"
-- Uses only standard math, table, string, and io libraries.

The script runs on standard Lua. It doesn't require complex package managers or bleeding-edge compilers.

Python example:

# Sticking to standard library when possible
import datetime # Built-in, boring, reliable
# vs
import arrow # Better API, but an extra dependency

Bigger system example: PostgreSQL - Startups often choose Postgres (boring, reliable, standard) over niche new NoSQL databases because "boring" means "it won't lose my data at 3 AM."


45. Tell, Don’t Ask

In two.lua:

local function add(i,v,  inc)
  -- Logic for how to add is INSIDE the object 'i' (via the function)
  -- We don't ask "is i a NUM?" then do math.
  if i.mode then i.has[v] = ...
  elseif i.mu then ...

You tell the add function to process v, and the internal logic decides how to handle it based on the object's structure.

Python example:

# Bad (Asking)
if wallet.balance > amount:
    wallet.balance -= amount

# Good (Telling)
wallet.debit(amount) # Internal check raises error if insufficient

Bigger system example: Microservices - You send a command "Process Order" to the Order Service. You don't query the Order Service database, check the status, and then write a new status yourself.


46. Localize Side Effects

In two.lua:

math.randomseed(the.seed)

The randomness is seeded once at the very top level. Functions like distx are pure calculations; they don't change global state or print to the console.

Python example:

def main():
    # Side effects (IO, DB) only in main/controller
    data = read_file()
    result = pure_calculation(data)
    write_file(result)

def pure_calculation(data):
    return data * 2 # No print statements here

Bigger system example: Redux (React) - Reducers are pure functions with zero side effects. All side effects (API calls) are localized in "Thunks" or "Sagas".


47. Avoid Temporal Coupling

In two.lua:

local function two(data) 
  local train,test,start,todo,seen,best,rest,d
  shuffle(data.rows)
  train,test = cut(data.rows, data.n//2)
  -- ...

The two function initializes train, test, seen right before using them. It doesn't rely on a global init() having been called 5 minutes earlier.

Python example:

# Bad: Order matters implicitly
obj.init()
obj.load()
obj.run()

# Good: Constructor handles it
obj = Runner(data) # Ready to go
obj.run()

Bigger system example: Dependency Injection Containers - They ensure that when you request a service, all its dependencies are already created and wired together, removing manual temporal setup steps.


48. Avoid Boolean Parameters

In two.lua:

local function cut(a0,n,  data)
  -- 'data' is an optional object, not a boolean 'is_data'
  if data then return clone(data,a1),clone(data,a2) end

Instead of passing true to say "return data objects", it passes the data object itself. If it's nil, it returns lists.

Python example:

# Bad
def create_user(name, is_admin=False): ...

# Good (Enums or separate methods)
def create_user(name, role=Role.USER): ...
def create_admin(name): ...

Bigger system example: Windows API (bad example) - CreateFile takes many boolean flags, making calls cryptic (true, false, true, false). Modern APIs prefer passing a config object or specific types.


49. Design for Debuggability

In two.lua:

local function o(v,     list,dict)--> s;; Make anything a string.
  -- This entire function exists solely to make complex tables 
  -- human-readable for debugging/printing.

The code includes a robust "toString" equivalent (o) specifically to make internal state visible.

Python example:

class User:
    def __repr__(self):
        return f"<User id={self.id} email={self.email}>"
    # Now print(user) gives useful info, not <object at 0x123>

Bigger system example: Chrome DevTools - The entire web ecosystem is built with inspection tools in mind (Elements panel, Console, Network tab) to allow developers to peek into the runtime.


50. Error Handling Is Control Flow

In two.lua:

local function csv(file,    src)
  src = assert(io.open(file)) -- Stops flow if file fails
  return function(    s)
    s = src:read()
    if s then return s2a(s) else src:close() end end end

Using assert halts the program immediately if the file is invalid. The closure uses if s then ... else close to control the loop termination.

Python example:

try:
    process_payment()
except InsufficientFunds:
    redirect_to_wallet() # Error drives the UX flow

Bigger system example: Erlang/Elixir Supervisors - The "Let it Crash" philosophy. If a process errors, it crashes, and a supervisor catches that crash (control flow) to restart it.


51. Zero-One-Infinity Rule

In two.lua:

for n,s in ipairs(row) do
  -- Handles ANY number of columns. 
  -- Not restricted to "2 columns" or "10 columns".

The code works for 0 columns (empty), 1 column, or N columns. It doesn't arbitrarily cap the schema size.

Python example:

# Allow list of any size
def process_items(items):
    for item in items:
        ...
# Don't create variables item1, item2, item3

Bigger system example: UNIX File Descriptors - You can open as many files as memory/OS limits allow. The system doesn't arbitrarily limit you to "3 open files."


52. CLI Modularity

In two.lua:

local function cli(d,funs)
  for i,s in pairs(arg) do
    if funs[s]
    then funs[s](coerce(arg[i+1])) ...

The CLI logic cli is generic. It takes a table of functions funs (egs). You can add new commands just by adding to egs, without changing the cli parser.

Python example:

import argparse
# Argparse handles the parsing logic separate from your business logic
parser = argparse.ArgumentParser()
parser.add_argument('--train', action='store_true')

Bigger system example: Git subcommands (git commit, git add) - Git is structured so that many subcommands are actually standalone binaries (git-commit, git-add) invoked by the main wrapper.


53. Pattern Matching (Simulated)

In two.lua:

local function add(i,v,  inc)
  if i.mode then ... -- It matches a SYM type
  elseif i.mu then ... -- It matches a NUM type

Lua lacks native structural pattern matching (its built-in patterns only match strings), so it simulates it by checking for the existence of unique keys (mode vs mu).

Python example:

# Python 3.10+ Pattern Matching
match shape:
    case Circle(r):
        return 3.14 * r * r
    case Rectangle(w, h):
        return w * h

Bigger system example: Rust match - The primary way to handle control flow. It forces you to handle every variant of an Enum, ensuring safety.


54. Variable Scoping (Local by Default)

In two.lua:

local DATA, NUM, SYM, COLS, clone, adds
local exp,sqrt,log  = math.exp, math.sqrt, math.log

Everything is declared local. This prevents polluting the global namespace and is faster in Lua.

Python example:

def my_func():
    x = 10 # Local variable
    global y # Explicitly asking for global (discouraged)

Bigger system example: JavaScript (let/const vs var) - Modern JS moved to block scoping (let) to avoid the issues caused by function-scoped or global variables (var).

You are right; while "Hungarian Notation" is the classic reference for encoding types in names, your header.md defines a much lighter, more ergonomic variant (e.g., n for number, s for string) that reduces visual noise while keeping the benefits.

Here are the missing heuristics (55–60) to complete your catalog, including the Layered Architecture, TDD, and the specific Type Hinting breakdown you requested.


55. Test-Driven Development (TDD)

In two.lua:

local function runs(  out)
  for k, fun in pairs(eg) do -- 'eg' is a table of test functions
    math.randomseed(the.seed) -- Reset seed before every test
    out = fun()
    if out == false then print("FAIL", k) else print("PASS", k) end
  end
end

The test runner is built into the file itself. You write the test in eg before the code to define the expected behavior.

Python example:

import pytest

# Write this FIRST (Red)
def test_calculator_add():
    assert add(2, 3) == 5

# Write this SECOND (Green)
def add(a, b):
    return a + b

Bigger system example: CI/CD Pipelines (GitHub Actions): Modern infrastructure won't let you merge code unless the test suite passes. The tests act as the "gatekeeper" of truth, enforcing that new code doesn't break old features.


56. Layered Architecture: Data Independence

In two.lua:

-- The logic (DATA class) doesn't know where data comes from.
function DATA:new(src)
  self.rows = {}
  if type(src) == "string" then csv(src, function(row) self:add(row) end) -- CSV file
  else for _,row in pairs(src or {}) do self:add(row) end                 -- In-memory table
  end 
end

The business logic (DATA) is decoupled from the storage mechanism. It works equally well with a CSV filename or a raw Lua table.

Python example:

# The Repository Pattern
class UserRepository:
    def get(self, id):
        # Could be SQL, could be JSON, could be Redis.
        # The calling code doesn't need to know.
        return db.execute("SELECT * FROM users WHERE id=?", id)

user = repo.get(1) # Business logic is protected from SQL syntax

Bigger system example: ODBC / JDBC: These drivers allow applications to connect to SQLite, Postgres, or Oracle using the exact same function calls. The database implementation is completely hidden from the application layer.


57. Layered Architecture: Dialog Independence

In two.lua:

-- Pure Logic (Core)
local function dist(r1, r2) return (r1.x - r2.x)^2 end

-- Presentation Layer (CLI)
local function cli() 
  print(dist(req_row(), db_row())) 
end
-- If we moved to a GUI, 'dist' would not change. Only 'cli' would be replaced.

The core algorithms do not contain print statements or GUI widgets. They return data, which the presentation layer decides how to display.

Python example:

# Core Logic
def calculate_tax(income):
    return income * 0.3

# Web Presentation (Flask)
@app.route('/tax/<int:income>')
def web_tax(income):
    return f"<h1>Tax: {calculate_tax(income)}</h1>"

# CLI Presentation (Click)
@click.command()
def cli_tax(income):
    print(f"Tax is: {calculate_tax(income)}")

Bigger system example: X11 Window System / React Native: The logic runs in one place (the client or the JS thread), and the "view" can be a Linux Desktop, an iOS screen, or an Android screen. The core logic is independent of the dialog toolkit.


58. Functional Programming (Higher-Order Functions)

In two.lua:

local function map(t, fun,    u) 
  u={}; for k,v in pairs(t) do u[k]=fun(v) end; return u 
end

-- Usage: Pass behavior as an argument
local squared = map({1,2,3}, function(x) return x*x end)

Functions are first-class citizens. You can pass logic (functions) around just like data.

Python example:

# Passing behavior into a function
numbers = [1, 2, 3, 4]
evens = list(filter(lambda x: x % 2 == 0, numbers))
doubled = list(map(lambda x: x * 2, numbers))

Bigger system example: React Hooks / Callbacks: useEffect(() => { doSomething() }). You pass a function to the framework, and the framework calls it back when the time is right.


59. Type Hinting (Hungarian vs. Lite vs. Modern)

In two.lua (Lite Hungarian): Instead of verbose "System Hungarian" (lpszName), two.lua uses a "Lite" schema defined in the header:

-- n=number; s=string; b=boolean
-- a,d = array,dict 
-- ds,dn = dict of strings, dict of numbers

local function add(n, s, t) ... end
-- 'n' tells us it expects a number
-- 's' tells us it expects a string
-- 't' tells us it expects a table

This reduces cognitive load without the visual clutter of full Hungarian notation.

Classic Hungarian (Historical Context):

// Apps Hungarian (Simonyi) - prefix indicates 'kind'
char  *szName;   // Zero-terminated String
int    iCount;   // Integer
long   lIndex;   // Long

Python example (Modern Type Hints): Modern languages moved the hint from the variable name to the syntax itself.

# Modern Type Hinting
def connect(timeout: int, retries: int = 3) -> bool:
    return True

The code documents itself. IDEs can now catch errors before you run the code.

Bigger system example: TypeScript: JavaScript was "loose," causing millions of runtime errors. TypeScript added a layer of strict types on top, becoming the industry standard for large web apps because it catches errors at compile time.


60. Documentation as Contract

In two.lua:

local help = [[
NAME:
  two.lua
SYNOPSIS:
  Classifies rows into 'best' or 'rest'.
INPUTS:
  csv file with headers. 
  Uppercase headers are numerics.
]]

The documentation isn't separate from the code; it is embedded in the code, often driving the configuration parsing (as seen in Rule 1/7).

Python example:

def square(n):
    """
    Returns the square of n.
    
    >>> square(2)
    4
    >>> square(-3)
    9
    """
    return n * n

Python Docstrings (""") allow tools to auto-generate websites (Sphinx) and run tests (Doctest) directly from the documentation.

Bigger system example: OpenAPI (Swagger): You write a YAML file describing your API. This file generates the documentation and the code and the testing tools. The documentation is the code.

Yes, absolutely. In software engineering, "personnel" and "process" heuristics are arguably more critical than syntax because they govern how the code actually gets written and maintained by humans.

Here are the heuristics for tackling complexity, maintaining motivation, and sorting priorities, formatted to match your existing catalog.


61. Gall's Law (Tackling Big Problems)

In two.lua:

-- Start simple. A complex system that works is invariably found to have 
-- evolved from a simple system that worked.
function DATA:new(src)
  self.rows = {} 
  -- Version 1: Just load the data. Worry about discretization later.
  if src then self:load(src) end
end

Do not try to build the complex version (bins, entropy, discretization) first. Build the version that just loads the file. If that works, add the next layer.

Python example:

# The "Walking Skeleton"
# Don't build the whole API. Build one endpoint that returns "Hello World".
# Ensure the database, server, and network connect before adding logic.

@app.get("/")
def health_check():
    return {"status": "ok"} 

Bigger system example: The MVP (Minimum Viable Product): Twitter started as a simple SMS service. Amazon started as a list of books. You cannot design a complex system from scratch; you must evolve it from a working simple system.


62. Weighted Shortest Job First (Agile Prioritization)

In two.lua:

-- Sorting the backlog (simulated)
local tasks = {
  {name="fix_typo",  cost=1,  val=1,  ratio=1},
  {name="crit_bug",  cost=5,  val=50, ratio=10}, -- Do this first!
  {name="new_feat",  cost=100,val=20, ratio=0.2}
}
table.sort(tasks, function(a,b) return a.ratio > b.ratio end)

To escape the "slump" of an infinite backlog, you don't sort by "importance" (Value) alone or by "ease" (Cost) alone. You sort by the ratio of the two: Cost of Delay divided by job duration (here, val / cost). This gives you the biggest bang for the buck immediately.

Python example:

# The Eisenhower Matrix in code
def get_next_task(tasks):
    """Return the next task to work on, or None when nothing important remains.

    Important-and-urgent tasks come first; otherwise the first important but
    not urgent task is returned so it can be scheduled. Unimportant tasks are
    never selected.
    """
    urgent = [t for t in tasks if t.urgent and t.important]
    if urgent: return urgent[0]

    plan = [t for t in tasks if not t.urgent and t.important]
    # An empty plan used to raise IndexError; report "nothing to do" instead.
    return plan[0] if plan else None  # Schedule these

Bigger system example: SAFe (Scaled Agile Framework): Large enterprises formally calculate WSJF = (User Value + Time Criticality) / Job Size. This removes the "loudest person in the room" bias and provides a mathematical way to pick what to work on next.


63. Short Feedback Loops (Escaping Motivational Slumps)

In two.lua:

-- The Repl (Read-Eval-Print Loop) approach
-- Don't write 100 lines. Write 1, print it.
local n = 10
print(n) -- Sanity check: I am not crazy, the computer is listening.

Motivational slumps often come from working in the dark for too long. The cure is to shorten the loop between "I type code" and "I see result."

Python example:

# TDD Red/Green loop
# 1. Write a failing test (Instant feedback: "It failed")
# 2. Write just enough code to pass (Instant feedback: "It passed")
# The dopamine hit of the "Green Bar" keeps you moving.

Bigger system example: Hot Reloading (React/Flutter/Vite): In modern web dev, you save the file and the browser updates instantly without a refresh. This keeps the developer in the "Flow State," preventing the mind from wandering during compilation times.


64. The Bus Factor (Personnel Patterns)

In two.lua:

-- This code is written using 'Lite Hungarian'
-- so that if 'timm' gets hit by a bus, someone else knows that
-- 'nC' is a number representing a count.
local function bins(nC, sName) ... end

Code readability is not about the computer; it is about the next human. If only one person understands the code, the project is fragile.

Python example:

# Docstrings are knowledge transfer
def calculate_variance(data):
    """
    Uses Welford's online algorithm to avoid catastrophic cancellation.
    Reference: Knuth Vol 2, p 232.
    """
    # The comment explains *why* we didn't just use sum(x^2), 
    # saving the next developer from "refactoring" it back to a buggy version.

Bigger system example: Pair Programming / Code Review: In systems like Linux or Google's Monorepo, no code enters the codebase without being read by a second person. This ensures that knowledge is distributed, "escaping" the head of a single developer.


65. Conway's Law (Organization Architecture)

In two.lua:

-- The code is split into DATA, NUM, SYM.
-- If three different people work on this, they will naturally 
-- create three distinct modules to avoid stepping on each other's toes.

"Organizations which design systems are constrained to produce designs which are copies of the communication structures of these organizations."

Python example:

# Microservices often reflect team structures.
# Team A owns the 'User' service.
# Team B owns the 'Payment' service.
# They communicate via API (HTTP) because the teams talk via Slack/Meetings.
import requests
response = requests.get('http://payment-service/api/v1/charge')

Bigger system example: Amazon's "Two-Pizza Teams": Amazon structures its teams so they are small enough to be fed by two pizzas. Consequently, their software architecture is composed of thousands of small, decoupled services. If they had a giant team, they would have built a giant monolith.

You are absolutely right to keep the momentum. The Spotify example is a perfect real-world analogy for the Space-Time tradeoff because it translates "Memory vs. CPU" into terms everyone understands: "Storage vs. Bandwidth."

Here is the final, polished set of the operational heuristics (66–70), including the corrected Space-Time (with Spotify) and Idempotency entries.


66. Fail Fast (Defensive Programming)

In two.lua:

function NUM:add(n)
  -- Don't wait for the math to return NaN later. 
  -- Stop execution immediately if the input is bad.
  assert(type(n) == "number", "NUM:add expects a number")
  self.n = self.n + 1
  ...
end

If the state is invalid, crash immediately. Debugging a crash at the source is 100x easier than debugging "why is my final calculation slightly off" three hours later.

Python example:

def process_age(age):
    """Return the natural log of age, rejecting invalid input up front.

    Raises ValueError for negative ages and for zero; log(0) is undefined, so
    zero is refused here with a clear message rather than inside math.log.
    """
    if age < 0:
        raise ValueError("Age cannot be negative") # Stop right here.
    if age == 0:
        raise ValueError("Age cannot be zero: log(0) is undefined")
    return math.log(age)

Bigger system example: Erlang "Let It Crash": In telecom systems (and WhatsApp), instead of trying to recover from a corrupted state, a process kills itself immediately. The supervisor detects the death and spawns a fresh, clean replacement instantly.


67. Space-Time Tradeoff (Memoization with Invalidation)

In two.lua:

-- 1. THE CACHE: mid() calculates the central tendency (centroid) of all columns.
-- This is expensive (it loops over every column).
local function mid(i)
  -- If we are a DATA object (have rows) and have a cached answer, return it.
  if i.rows and i._mid then return i._mid end
  
  -- Otherwise, calculate it and STORE it in _mid
  local t={}; for _,col in pairs(i.cols.all) do t[1+#t] = mid(col) end 
  i._mid = t 
  return i._mid 
end

-- 2. THE INVALIDATION: When data changes, we must wipe the cache.
local function add(i,v,  inc)
  if i.rows then
    i._mid = nil -- <--- DIRTY FLAG: Data changed, so old cache is invalid
    ...
  end
end

We trade memory (i._mid) to save CPU cycles. However, unlike a static cache, this one is dynamic: the moment add() modifies the data, we explicitly set i._mid = nil so the next call to mid() knows to re-calculate.

Python example:

class Dataset:
    def __init__(self):
        self._cached_stats = None
        self.data = []

    def add(self, row):
        self.data.append(row)
        self._cached_stats = None # Invalidate cache

    @property
    def stats(self):
        if self._cached_stats is None:
            # Expensive calculation happens only when needed
            self._cached_stats = calculate_heavy_stats(self.data)
        return self._cached_stats

Bigger system example: Spotify Local Cache: Spotify downloads your frequently played songs to your device's hard drive (Space). When you press play, it reads from the disk instead of streaming from the internet. This trades Storage Space (on your phone) to save Time/Bandwidth (network latency).


68. The Law of Demeter (Principle of Least Knowledge)

In two.lua:

-- BAD: Reaching through objects
-- print(myModel.rows[1].cells[3]) 

-- GOOD: Ask the object to do the work for you
print(myModel:getCell(1, 3))

An object should only talk to its immediate friends. If you change how rows are stored (e.g., from an array to a database cursor), the "Bad" code breaks. The "Good" code survives because the implementation is hidden.

Python example:

# Bad
user.wallet.credit_card.charge(100)

# Good
user.make_payment(100) # The user object knows how to handle its own wallet.

Bigger system example: Microservices API Gateways: Service A never talks directly to Service B's database. It calls Service B's API. This ensures Service B can change its database schema without breaking Service A.


69. Idempotency (Config-Driven State Reset)

In two.lua:

-- The '-s' flag does two things:
-- 1. Updates the config record (the.seed)
-- 2. IMMEDIATELY forces the runtime state (math.randomseed) to match.
egs["-s"] = function(n) 
  math.randomseed(n) -- Reset the RNG now
  the.seed = n       -- Remember this for later
end

This ensures that the command-line argument isn't just a "suggestion" for future operations; it is an imperative command to reset the universe right now. This guarantees that lua two.lua -s 42 behaves identically every single time it is run, regardless of what happened before that line was executed.

Python example:

import random
import sys

def set_seed(n):
    # Update global config AND runtime state simultaneously
    CONFIG['seed'] = n
    random.seed(n)

if __name__ == "__main__":
    if "--seed" in sys.argv:
        # User input forces immediate deterministic state
        set_seed(int(sys.argv[sys.argv.index("--seed") + 1]))

Bigger system example: Terraform / Ansible: These tools are designed to be idempotent. If you run "Ensure Server X exists" 50 times, it creates the server once and does nothing the other 49 times. It forces the reality to match the configuration, resetting state only if necessary.


70. The Pareto Principle vs. Premature Optimization

  • Premature Optimization is a rule about Time (When). "Don't optimize yet because you don't know what matters."
  • The Pareto Principle is a rule about Focus (Where). "When you do optimize, only look at the 20% of code that is running 80% of the time."

In two.lua:

-- Optimization Strategy:
-- 1. Write clear code first (Avoid Premature Opt).
-- 2. If slow, find the 'hot loop' (Pareto).
-- 3. Only optimize that loop.

function DATA:dist(row1, row2) 
   -- This function is called N^2 times. 
   -- Optimizing THIS function yields 80% of the speedup.
   -- Optimizing the CLI argument parser yields 0% of the speedup.
end

Python example:

# cProfile output
# ncalls  tottime  filename:lineno(function)
# 100000    5.000  myscript.py:20(heavy_math)  <-- The 20% to fix
#      1    0.001  myscript.py:1(setup)        <-- The 80% to ignore

Bigger system example: Bug Triage: Microsoft found that fixing the top 20% of reported bugs eliminated 80% of the crashes and security errors. Not all bugs are created equal.

Here is the updated list. I have softened the tone on the "Root/Sudo" heuristic (#73) to be about privilege rather than "brokenness," and I have converted the specific warnings from your "Ell/Backpacker" text into a new category of Complexity & Dependency Anti-Patterns.


71. VITAL (Vital Infrastructure: Acquire Locally)

In two.lua:

-- Instead of 'require "argparse"', we write 5 lines of code:
local the={}; for k,v in help:gmatch("(%S+)=(%S+)") do the[k] = coerce(v) end

If a dependency does one simple thing (like parsing flags or left-padding a string), own it. Copy the logic into your code. Do not import a 5MB library for a 5-line function. Dependencies break over time; your own code stays stable.

Python example:

# Bad: pip install colorama (adds dependency, installation step, version conflicts)
# Good: Just define the ANSI codes you need.
class Colors:
    HEADER = '\033[95m'
    ENDC = '\033[0m'

Bigger system example: SQLite: It has zero external dependencies. It doesn't even use the standard string library of the OS if it can avoid it. This makes it compilable on a toaster.


72. The Lindy Effect (Longevity via Simplicity)

In two.lua:

-- We use CSV. Not Parquet, not Protocol Buffers, not HDF5.
-- CSV was readable in 1980. It will be readable in 2080.
function csv(sFilename, fun) ... end

The longer a technology has been around (Text, Shell, Lua, SQL), the longer it is likely to remain around. New, complex formats (.toml, .yaml, setup.py) churn every 5 years. Bet on the technology that has already survived.

Python example:

# Bad: Using a specific ORM version (SQLAlchemy 1.4)
# Good: Writing raw SQL queries. SQL is 50 years old and isn't changing.
cursor.execute("SELECT * FROM users WHERE id=?", (1,))

Bigger system example: Makefiles: Despite thousands of "better" task runners (Grunt, Gulp, Webpack, Turborepo), make is still here because it is simple, file-based, and standard.


73. User-Space Sovereignty (Principle of Least Privilege)

In two.lua:

-- Does not require 'luarocks install'. 
-- Does not require writing to /usr/bin.
-- It runs where it sits.

Tools should respect the system they run on. Requiring sudo (root) to install a text-processing tool creates security risks and friction. Good tools run comfortably in user-space (~/bin) and clean up after themselves.

Python example:

# Bad: Hardcoding paths to /var/log/myapp (Requires Root)
# Good: Adhering to XDG_DATA_HOME or local directories
log_dir = os.environ.get('XDG_DATA_HOME', './logs')

Bigger system example: AppImage / PortableApps: Applications that bundle everything they need into a single file that runs without installation, respecting the user's environment.


74. The "Hello World" Latency (Instant Start)

In two.lua:

-- Startup time is dominated by parsing the source code. 
-- No heavy VM initialization, no container spin-up.
-- Run time < 0.01s for help.

Control feels like friction if it is slow. CLI tools must start instantly. If the user hesitates to run the command because of the startup time, the tool is too heavy.

Python example:

# Bad: import pandas as pd (Takes 0.5s - 1.5s just to load)
# Good: import csv (Instant)

Bigger system example: Ripgrep (rg) vs Grep: Ripgrep is preferred by many developers not just because it searches faster, but because it starts instantly and respects .gitignore by default, reducing friction.


75. Transparency (No Magic Black Boxes)

In two.lua:

-- The model isn't a binary blob. It's a readable table of centroids.
-- You can print(DATA.stats) and read the logic with human eyes.

You cannot "own" what you cannot read. Binary formats, pickled objects, and compiled black boxes prevent you from fixing problems when the original author disappears.

Python example:

# Bad: Pickle (Python object serialization). It is opaque and dangerous.
# Good: JSON. It is verbose, but you can debug it with 'cat'.

Bigger system example: Unix /proc file system: Linux exposes kernel internals as text files. You don't need a special API to see CPU info; you just cat /proc/cpuinfo.

Anti Patterns

Found in two.lua

Based on the code in two.lua, here are common software engineering anti-patterns present in the file:

  • Magic Numbers: Hardcoded constants like 1.7, 1e-32, and 6.28 appear directly in mathematical formulas without explanation.
  • Cryptic Naming: Widespread use of single-letter variables (i, v, c, t) reduces readability for outsiders.
  • God Class: The DATA class mixes multiple responsibilities: file I/O, data storage, statistical summarization, and machine learning logic.
  • Reinventing the Wheel: Manual implementation of CSV parsing and CLI argument parsing instead of using standard libraries.
  • Global Mutable State: The global the table controls application behavior and is mutable from anywhere.
  • Command-Query Separation Violation: The add function modifies the object's state and returns the value added, blurring the line between action and query.
  • Implicit Type Coercion: The coerce function silently converts strings to numbers or booleans based on regex matching, which can hide type errors.
  • Type-Switching Conditional (the inverse of Shotgun Surgery): The add function contains conditional logic for NUM, SYM, and DATA, meaning any change to the "adding" logic of one type requires editing one complex shared function.
  • Stringly Typed: Configuration defaults are parsed directly from a help string rather than being defined in a structured configuration object.
  • Primitive Obsession: Complex concepts like "Rows" or "Clusters" are treated as generic Lua tables rather than distinct data types.

Complexity & Dependency Anti-Patterns (The "Modern Stack" Traps)

  • Resume-Driven Development: Choosing a complex technology (Kubernetes, React, Microservices) for a simple problem just to put it on your CV.
  • Dependency Hell: When your tool requires a library, which requires another library, which conflicts with the version of the library your OS uses.
  • Supply Chain Suicide: Referencing a package manager URL that you do not own. If the internet goes down, or the author deletes the repo (e.g., the "Left-Pad" incident), your build fails.
  • Config Porn: Having more lines of YAML/TOML configuration and CI/CD scripts than actual application logic.
  • Container Obsession: "It works on my machine because I shipped my machine." Using Docker to run a simple 50-line script is an admission of failure in portability.
  • Friction Tolerance: Accepting slow startup times, complex install procedures, or flaky builds as "normal" parts of the development process.

Management & Process Anti-Patterns

  • Death March: A project destined to fail where the team is pressured to work unsustainable hours (nights/weekends) to meet an impossible deadline.
  • Brooks’ Law: Adding manpower to a late software project makes it later (due to ramp-up time and communication overhead).
  • Gold Plating: Continuing to work on a feature well past the point where the extra effort adds any value to the user.
  • Feature Creep: The continuous addition of new features that go beyond the original scope, preventing the project from ever finishing.
  • Mushroom Management: Keeping developers in the dark (about business goals) and feeding them fertilizer (misinformation).
  • Smoke and Mirrors: Creating a demo that looks like it works but is actually hardcoded or faked behind the scenes.
  • Vendor Lock-In: Becoming so dependent on a vendor's proprietary technology that switching becomes prohibitively expensive.

Architectural Anti-Patterns

  • The Golden Hammer: Assuming that a favorite technology (e.g., "Blockchain" or "AI") is the solution to every single problem.
  • Boat Anchor: Retaining a piece of code or library that is no longer used "just in case" we need it later.
  • Lava Flow: Old, poorly understood code that remains in the system because everyone is too afraid to delete it.
  • Poltergeists: Classes that have no real responsibility other than to call methods in other classes (often called Manager or Controller).
  • Big Ball of Mud: A system with no recognizable structure or architecture (the ultimate result of unchecked "Spaghetti Code").
  • Stovepipe System: Independent systems that cannot talk to each other, resulting in duplicate data entry.
  • Swiss Army Knife: A single component or interface designed to handle too many unrelated tasks (excessive complexity).

Coding Anti-Patterns

  • Soft Coding: Moving so much logic into configuration files (DB-driven behavior) that the config file becomes a programming language itself.
  • Action at a Distance: When a change in one part of the program causes an unexpected error in a completely unrelated part.
  • Blind Faith: Calling a function (especially an API or system call) and assuming it works without checking the return value or catching errors.
  • Object Orgy: Failing to encapsulate data, allowing any part of the system to modify the internals of an object directly.
  • Yo-Yo Problem: A class hierarchy so deep that you have to bounce up and down (yo-yo) between definitions to understand the code.
  • Copy-Paste Programming: Duplicating code blocks instead of creating a reusable function (violates DRY).
#!/usr/bin/env lua
-- Module-level state shared by everything below:
--   the : settings table (replaced further down by the result of parsing `help`)
--   l   : namespace table that holds the library helpers
--   egs : command-line actions, keyed by flag
local the,l,egs = {},{},{} -- config, lib, demos
-- Help text. Besides being printed by the -h action, every `key=value`
-- pair inside it supplies a default setting (parsed by l.settings).
local help = [[
six : stochastic incremental XAI
(c) 2025, Tim Menzies, [email protected], mit-license.org
Options:
-h Show help.
-b bins=7 Number of bins for discretization.
-e era=30 Update model every `era` number of rows.
-r ruleMax=3 Max conditions in a rule.
-s seed=42 Random number seed.
-f file=../lua6/auto93.csv ]]
-- ----------------------------------------------------------------------------
local DATA, NUM, SYM, COLS = {},{},{},{}
-- Build a DATA holding `rows` (a list of rows) and `cols` (column summaries).
-- Every entry of the optional `rows` argument is passed through i:add();
-- `i` is a local declared in the parameter list, not a real argument.
function DATA.new(rows,    i)
  i = l.new(DATA, {rows = {}, cols = nil})
  if rows then
    for _, row in pairs(rows) do
      i:add(row)
    end
  end
  return i
end
-- Return a new DATA with the same column names as `i`, optionally filled
-- with `rows`. `clone` is a local declared in the parameter list.
function DATA.clone(i, rows,    clone)
  -- DATA.new must be called with '.', not ':'. The colon form passed the
  -- DATA class table itself as the `rows` argument.
  clone = DATA.new({i.cols.names})
  for _, row in pairs(rows or {}) do
    clone:add(row)
  end
  return clone
end
-- Build a NUM: an incremental summary (n, mu, m2, sd) of a numeric column.
--   at : column position (defaults to 0)
--   s  : column name (or any value; it is stored in `of`)
-- `best` is 1 when the name ends in "+", else 0.
function NUM.new(at, s)
  -- tostring() never returns nil, so no fallback is needed. The "+" is
  -- escaped because it is a magic character in Lua patterns.
  local isMaximized = tostring(s):find("%+$")
  return l.new(NUM, {at = at or 0, of = s, n = 0, mu = 0, m2 = 0, sd = 0,
                     bins = {}, best = isMaximized and 1 or 0})
end
-- Build a SYM: a summary of a symbolic column.
--   at : column position
--   s  : column name
-- `has` holds the per-value counts; `mode`/`most` track the commonest value.
function SYM.new(at, s)
  local fields = {at = at, of = s, n = 0, bins = {}, has = {},
                  mode = 0, most = -1}
  return l.new(SYM, fields)
end
-- Turn a header row (list of column names) into column summaries.
-- Names starting with an upper-case letter become NUM columns, all others
-- SYM. Names ending in "X" are kept in `all` but left out of `x` and `y`.
-- Names ending in "+" or "-" go to `y`, the rest to `x`.
-- x, y, all, col are locals declared in the parameter list.
function COLS.new(row,    x, y, all, col)
  x, y, all = {}, {}, {}
  for n, s in ipairs(row) do
    -- The classes are named NUM and SYM; `Num`/`Sym` were undefined globals.
    col = (s:match"^[A-Z]" and NUM or SYM).new(n, s)
    all[n] = col
    if not s:match"X$" then
      l.push(s:find"[+-]$" and y or x, col)
    end
  end
  return l.new(COLS, {all = all, x = x, y = y, names = row})
end
-- -----------------------------------------------------------------------------
-- Add one row. The first row ever seen is treated as the header and builds
-- the columns; later rows update every column summary and are stored.
-- Returns the row.
function DATA.add(i, row)
  if not i.cols then
    i.cols = COLS.new(row)
    return row
  end
  for _, col in pairs(i.cols.all) do
    col:add(row[col.at])
  end
  l.push(i.rows, row)
  return row
end
-- Count one symbol, ignoring the "?" marker. Keeps `mode`/`most` pointing at
-- the most frequent symbol seen so far. Returns v.
function SYM.add(i, v)
  if v == "?" then return v end
  i.n = i.n + 1
  i.has[v] = 1 + (i.has[v] or 0)
  local seen = i.has[v]
  if seen > i.most then
    i.most, i.mode = seen, v
  end
  return v
end
-- Fold one number into the running mean and standard deviation, updating
-- n, mu, m2 and sd incrementally. The "?" marker is skipped. Returns n.
-- `d` is a local declared in the parameter list.
function NUM.add(i, n,    d)
  -- The guard must test the parameter `n`. It used to test an undefined
  -- global `v`, so "?" cells fell through into the arithmetic below.
  if n ~= "?" then
    i.n = i.n + 1
    d = n - i.mu
    i.mu = i.mu + d / i.n
    i.m2 = i.m2 + d * (n - i.mu)
    i.sd = i.n < 2 and 0 or (i.m2 / (i.n - 1))^0.5
  end
  return n
end
-- -----------------------------------------------------------------------------
-- Squash v through a logistic curve centred on the column mean and scaled
-- by its standard deviation; the 1e-32 guards against division by zero.
function NUM.norm(i, v)
  local exponent = -1.7 * (v - i.mu) / (i.sd + 1e-32)
  return 1 / (1 + math.exp(exponent))
end
-- A symbol is its own bin.
function SYM.bin(_, v)
  return v
end
-- Map a number to a bin index: the.bins times the normalized value, floored.
-- The "?" marker is returned unchanged.
function NUM.bin(i, v)
  if v == "?" then return v end
  return math.floor(the.bins * i:norm(v))
end
-- Root-mean-square gap between the row's normalized y values and each y
-- column's `best` value. `d` is a local declared in the parameter list.
function DATA.disty(i, row,    d)
  d = 0
  for _, y in pairs(i.cols.y) do
    local gap = y:norm(row[y.at]) - y.best
    d = d + gap^2
  end
  return (d / #(i.cols.y)) ^ 0.5
end
-- For every x column, find the bin this row's value falls in and add the
-- row's disty score to a NUM kept for that bin (created on first use).
-- Missing values ("?") are skipped. v, y are locals declared in the
-- parameter list.
function DATA.bins(i, row,    v, y)
  y = i:disty(row)
  for _, x in pairs(i.cols.x) do
    v = x:bin(row[x.at])
    if v ~= "?" then
      -- The class is NUM; `Num` was an undefined global.
      x.bins[v] = x.bins[v] or NUM.new(x.at, v)
      x.bins[v]:add(y)
    end
  end
end
-- -----------------------------------------------------------------------------
-- Build a rule from the lowest-scoring bins.
--   stop : optional cap on how many bins (taken in ascending score order) to use
-- Each bin's score is floor(100 * (mu + sd / sqrt(n))) of the y distances
-- stored in it. Returns a table mapping column position -> list of chosen
-- bins for that column. t, u, f are locals declared in the parameter list.
function DATA.rule(i, stop,    t, u, f)
  -- A second parameter also called `stop` used to shadow the real one, so
  -- the cap was always nil. It has been removed.
  f = function(n) return math.floor(100 * (n.mu + n.sd / n.n^.5)) end
  t, u = {}, {}
  for _, x in pairs(i.cols.x) do
    for _, col in pairs(x.bins) do
      l.push(t, col)          -- helpers live in `l`; bare `push` was undefined
    end
  end
  -- ipairs walks the sorted list in order, which the `stop` cut-off needs.
  for j, col in ipairs(l.sort(t, l.lt(f))) do
    if stop and j > stop then break end
    u[col.at] = u[col.at] or {}
    l.push(u[col.at], col)    -- tests on each attribute are disjunctions
  end
  return u
end
-- True if the row falls in any one of the bins in `ors`.
-- `or` is a reserved word in Lua, so `function DATA.or` does not parse. The
-- field keeps the name "or" but is defined with bracket syntax and must be
-- called as i["or"](i, ors, row).
DATA["or"] = function(i, ors, row)
  for _, col in pairs(ors) do
    if col.of == i.cols.all[col.at]:bin(row[col.at]) then return true end
  end
  return false
end
-- True if the row satisfies every group of bins in `ands`.
-- `and` and `or` are reserved words in Lua, so neither `function DATA.and`
-- nor `i:or(...)` parses. Both fields are reached with bracket syntax.
DATA["and"] = function(i, ands, row)
  for _, ors in pairs(ands) do
    if not i["or"](i, ors, row) then return false end
  end
  return true
end
-- Return the rows that satisfy `rule`.
-- `u` is a local declared in the parameter list.
function DATA.selects(i, rule, rows,    u)
  u = {}
  for _, r in pairs(rows) do
    -- `and` is a reserved word, so the field is reached with brackets. The
    -- old call also passed the receiver twice, shifting every argument.
    if i["and"](i, rule, r) then l.push(u, r) end
  end
  return u
end
-- Demo: read `file`, shuffle its rows, and split them into train and test.
-- Training rows are fed one at a time into a fresh DATA; every `the.era`
-- rows a rule is rebuilt, applied to the test rows, and the lowest disty
-- score among the selected rows is printed.
-- Everything after `file` is a local declared in the parameter list.
function main(file,    data1, data2, y, train, test, rule, picks)
  data1 = DATA.new()
  for _, row in l.csv(file) do data1:add(row) end
  train, test = {}, {}
  for j, row in ipairs(l.shuffle(data1.rows)) do
    l.push(j < #data1.rows / 2 and train or test, row)
  end
  rule, data2 = nil, data1:clone()
  y = function(row) return data2:disty(row) end
  for r, row in ipairs(train) do
    data2:bins(data2:add(row))
    if r % the.era == 0 then
      -- The cap comes from the `ruleMax` setting, falling back to 3.
      rule = data2:rule(math.min(the.ruleMax or 3, #data2.rows / the.era))
      -- l.sort needs a comparator, so the key function is wrapped in l.lt.
      picks = l.sort(data2:selects(rule, test), l.lt(y))
      -- Nothing is printed when the rule selects no test rows.
      if picks[1] then print(y(picks[1])) end
    end
  end
end
-- -----------------------------------------------------------------------------
-- Misc tricks
-- Make `t` an instance of `meta`: lookups that miss in t fall back to meta.
function l.new(meta, t)
  meta.__index = meta
  return setmetatable(t, meta)
end

-- Shorthand for string.format.
l.fmt = string.format

-- Append v to the end of list t; returns v.
function l.push(t, v)
  t[#t + 1] = v
  return v
end

-- Sort t in place (optional comparator f); returns t.
function l.sort(t, f)
  table.sort(t, f)
  return t
end

-- Turn a key function f into an ascending comparator.
function l.lt(f)
  return function(a, b)
    return f(a) < f(b)
  end
end
-- Shuffle list t in place, walking from the last slot down to the second and
-- swapping each with a randomly chosen slot at or before it. Returns t.
-- `n` is a local declared in the parameter list.
function l.shuffle(t,    n)
  local m = #t
  while m >= 2 do
    n = math.random(m)
    t[m], t[n] = t[n], t[m]
    m = m - 1
  end
  return t
end
-- Thing to string
-- Append the string form of every array item in `a` to list `u`; returns u.
-- The helpers live in the `l` table; bare `push` and `cat` were undefined.
local function _a2a(a, u)
  for _, v in ipairs(a) do
    l.push(u, l.cat(v))
  end
  return u
end
-- Append ":key value" strings for every non-function field of `d` to list
-- `u`, then return u sorted.
local function _d2a(d, u)
  for k, v in pairs(d) do
    local isFunction = type(v) == "function"
    if not isFunction then
      l.push(u, l.fmt(":%s %s", k, l.cat(v)))
    end
  end
  return l.sort(u)
end
-- Convert anything to a string. Whole numbers print via "%s", other numbers
-- with three decimals; tables print as "{...}" using the array form when
-- #t > 0 and the ":key value" form otherwise; everything else via tostring.
function l.cat(t)
  if type(t) == "number" then
    return l.fmt(t % 1 == 0 and "%s" or "%.3f", t)
  end
  if type(t) ~= "table" then
    return tostring(t)
  end
  local render = #t > 0 and _a2a or _d2a
  return "{" .. table.concat(render(t, {}), " ") .. "}"
end
-- Print the string form of t; returns t so calls can be chained.
function l.pat(t)
  print(l.cat(t))
  return t
end
-- String to thing
-- Convert a string to a number when possible; otherwise return it with
-- leading and trailing whitespace removed. nil stays nil.
function l.coerce(s)
  if s == nil then return nil end
  local num = tonumber(s)
  if num then return num end
  return s:match'^%s*(.-)%s*$'
end
-- Split a line on commas and coerce each non-empty piece; returns the list.
-- `a` is a local declared in the parameter list.
local function _cells(s,    a)
  a = {}
  for piece in s:gmatch"([^,]+)" do
    l.push(a, l.coerce(piece))
  end
  return a
end
-- Iterate over the lines of a comma-separated file.
-- Opens `file` (assert raises if io.open fails) and returns an iterator that
-- yields `n, cells`: a line counter starting at 1 and the line split by _cells.
-- At end of input the stream is closed and the iterator returns nothing.
-- n, stream are locals declared in the parameter list.
-- NOTE(review): the stream is only closed when the input is read to the end;
-- a caller that stops early leaves it open. Confirm whether that matters.
function l.csv(file, n,stream)
n,stream = 0,assert(io.open(file))
return function( s)
s= stream:read()
if s then n=n+1; return n,_cells(s) else stream:close() end end end
-- Config management.
-- Collect every `key=value` pair found in string s into a table, coercing
-- each value. `d` is a local declared in the parameter list.
function l.settings(s,    d)
  d = {}
  for key, value in s:gmatch("(%S+)=(%S+)") do
    d[key] = l.coerce(value)
  end
  return d
end
-- Apply command-line arguments, left to right.
--   d    : settings table, updated in place and returned
--   funs : actions keyed by flag; a matching flag calls its action with the
--          (coerced) following argument
-- Any other flag "-x" updates every setting whose name starts with "x".
function l.cli(d, funs)
  -- ipairs visits arguments in the order given, so an earlier flag such as
  -- "-s" takes effect before a later action runs; pairs gave no such
  -- guarantee. `arg` may be absent in embedded hosts.
  for i, s in ipairs(arg or {}) do
    local value = arg[i + 1]
    if funs[s] then
      funs[s](l.coerce(value))
    elseif value ~= nil then
      -- A flag with nothing after it used to overwrite the setting with nil.
      for k, _ in pairs(d) do
        if k:sub(1, 1) == s:sub(2) then d[k] = l.coerce(value) end
      end
    end
  end
  return d
end
-- -----------------------------------------------------------------------------
-- Command-line actions.
egs["-h"] = function(_) print("\n"..help.."\n") end
egs["-s"] = function(n) math.randomseed(n); the.seed = n end
egs["--the"] = function(_) l.pat(the) end
egs["--guess"] = function(_) main(the.file) end

-- Defaults come from the help text; the RNG is seeded from them.
the = l.settings(help)
math.randomseed(the.seed)

-- Only process the command line when this file is the script being run.
-- `arg` can be missing in embedded hosts, and the name is matched as plain
-- text so that "." is not treated as a pattern wildcard.
if arg and arg[0] and arg[0]:find("six.lua", 1, true) then
  the = l.cli(the, egs)
end
return {DATA=DATA, NUM=NUM, SYM=SYM, COLS=COLS, help=help, the=the, l=l}
#!/usr/bin/env lua
local the = {leaf=2}
local push, coerce, cells, csv, inc2, cli
local tree, show, read,_tally, _split, _div, _kids
-- Build a tree node for `rows`. col/val/op record the test that led to this
-- node (nil at the root); `mu` is the mean klass value reported by _split;
-- `kids` are the sub-trees made by dividing on the best column and value.
function tree(goal, klass, rows,    col, val, op)
  local bestCol, bestVal, mean = _split(goal, klass, rows)
  local children = _kids(goal, klass, rows, bestCol, bestVal)
  return {rows = rows, col = col, val = val, op = op, mu = mean,
          kids = children}
end
-- Pick the (column, value) pair with the highest score.
-- Counts come from _tally: `all[c][v]` is the number of rows with value v in
-- column c, and `good[c][v]` the number of those whose klass value is <= goal.
-- For each good value, p = g/n and the score is p*p / (p + (n-g)/n + 1e-32).
-- Returns col, val (both nil when no value was seen in a good row) and mu,
-- the mean klass value from _tally. Everything after `rows` in the parameter
-- list is a local.
-- NOTE(review): p + (n-g)/n is always 1, so the score is effectively p^2;
-- confirm whether a different denominator was intended.
-- NOTE(review): ties are broken by pairs() order, which Lua leaves unspecified.
function _split(goal,klass, rows, all,good,best,col,val,n,p,s,mu)
all, good, mu = _tally(goal,klass, rows,{},{})
best = -1
for c, vals in pairs(good) do
for v, g in pairs(vals) do
n = all[c][v]
p = g/n
s = p*p / (p + (n-g)/n + 1e-32)
if s > best then best, col, val = s, c, v end end end
return col, val, mu end
-- Count values per column. Every non-"?" cell outside the klass column is
-- counted in `all`; it is also counted in `good` when the row's klass value
-- is <= goal. Returns all, good and the mean klass value over the rows.
-- `y` is a local declared in the parameter list.
function _tally(goal, klass, rows, all, good,    y)
  y = 0
  for _, row in pairs(rows) do
    y = y + row[klass]
    for c, v in pairs(row) do
      local counted = c ~= klass and v ~= "?"
      if counted then
        inc2(all, c, v)
        if row[klass] <= goal then
          inc2(good, c, v)
        end
      end
    end
  end
  return all, good, y / #rows
end
-- Divide rows on (col1, val1) and grow a sub-tree for each part that has at
-- least the.leaf rows and fewer rows than its parent. Returns a table keyed
-- by the operator. `out` is a local declared in the parameter list.
function _kids(goal, klass, rows, col1, val1,    out)
  out = {}
  local parts = _div(col1, val1, rows)
  for op, part in pairs(parts) do
    local bigEnough = #part >= the.leaf
    if bigEnough and #part < #rows then
      out[op] = tree(goal, klass, part, col1, val1, op)
    end
  end
  return out
end
-- Partition rows by whether column c equals v. Rows whose cell is "?" go to
-- the "==" side. `yes` and `no` are locals declared in the parameter list.
function _div(c, v, rows,    yes, no)
  yes, no = {}, {}
  for _, r in pairs(rows) do
    local cell = r[c]
    if cell == "?" or cell == v then
      push(yes, r)
    else
      push(no, r)
    end
  end
  return {["=="] = yes, ["!="] = no}
end
----------------------------------------------------------------
-- Print the tree, one node per line, indented by "|.. " per level. Each line
-- shows the node's test (empty for a node with no col) and its row count.
function show(t, lvl)
  lvl = lvl or 0
  local label = ""
  if t.col then
    label = t.col .. " " .. t.op .. " " .. t.val
  end
  print(string.format("%s%s (n=%d)", ("|.. "):rep(lvl), label, #t.rows))
  for _, kid in pairs(t.kids or {}) do
    show(kid, lvl + 1)
  end
end
-- Read rows from csv(). The first row is the header; the klass column is the
-- one whose name ends in "-". Data rows are sorted ascending on klass.
-- Returns goal, klass, rows, where goal is the klass value of the row at
-- position floor(sqrt(#rows)) in that order.
-- Raises an error when no klass column is found or there are no data rows.
-- rows, klass are locals declared in the parameter list.
function read(    rows, klass)
  rows = {}
  for row in csv() do
    if not klass then
      -- The leftover debug print of the header table has been removed.
      for i, s in pairs(row) do
        -- "-" is a magic pattern character, so escape it; tostring guards
        -- against header cells that coerce() turned into numbers.
        if tostring(s):find("%-$") then klass = i end
      end
    else
      rows[#rows + 1] = row
    end
  end
  assert(klass, "read: no column name ending in '-' found in the header")
  assert(#rows > 0, "read: no data rows")
  table.sort(rows, function(t, u) return t[klass] < u[klass] end)
  return rows[math.floor((#rows)^.5)][klass], klass, rows
end
-------------------------------------------------------------------------------
-- Append x to the end of list t; returns x.
function push(t, x)
  local slot = #t + 1
  t[slot] = x
  return x
end
-- Return s as a number when tonumber accepts it, otherwise s unchanged.
function coerce(s)
  local num = tonumber(s)
  if num then return num end
  return s
end
-- Split a line on commas, coercing each non-empty piece; returns the list.
-- `t` is a local declared in the parameter list.
function cells(s,    t)
  t = {}
  for piece in s:gmatch"([^,]+)" do
    t[#t + 1] = coerce(piece)
  end
  return t
end
-- Return an iterator over the default input: each call reads one line with
-- io.read() and returns its cells, or a false value when input is exhausted.
function csv()
  return function()
    local line = io.read()
    return line and cells(line)
  end
end
-- Increment the two-level counter t[c][v], creating t[c] on first use.
-- Returns the new count.
function inc2(t, c, v)
  local inner = t[c] or {}
  t[c] = inner
  inner[v] = (inner[v] or 0) + 1
  return inner[v]
end
-- Update settings in `the` from the command line: "-name value" sets
-- the.name when that setting already exists.
function cli()
  for i, s in pairs(arg or {}) do
    local key, value = s:sub(2), arg[i + 1]
    -- A flag with nothing after it is ignored; it used to set the option to
    -- nil, which broke later comparisons against the.leaf.
    if the[key] and value ~= nil then
      the[key] = coerce(value)
    end
  end
end
-------------------------------------------------------------------------------
-- Run the demo only when this file is executed as a script.
-- Testing `... == nil` alone fails as soon as any command-line argument is
-- given, because the chunk's varargs then hold those arguments. So the
-- chunk's source name is also compared with the script name in arg[0].
local isMain = (...) == nil
if not isMain and arg and arg[0] then
  local info = debug.getinfo(1, "S")
  isMain = info ~= nil and info.source == "@" .. arg[0]
end
if isMain then
  cli()
  local goal, klass, rows = read()
  show(tree(goal, klass, rows))
end
return {tree=tree, show=show, read=read}
"""
def mad(rows):
    """Median absolute deviation of y(row) over rows.

    Uses the upper-middle element as the median and relies on a `y` accessor
    being in scope.
    """
    values = sorted(y(row) for row in rows)
    middle = len(values) // 2
    median = values[middle]
    deviations = sorted(abs(v - median) for v in values)
    return deviations[middle]
def mads(a, b):
    """Size-weighted average of mad(a) and mad(b)."""
    n1 = len(a)
    n2 = len(b)
    weighted = n1 * mad(a) + n2 * mad(b)
    return weighted / (n1 + n2)
def key(rows,col,klass):
x = lambda row:row[col]
y = lambda row:row[klass]
rows = sorted([row in rows if x(row) != "?"]), key=x)
most = mad(rows)
b4,min = [],[],-1,len(rows)**.5
for i in range(len(rows)):
b = int(BINS/(1+e^(-1.704*(mu - x(rows[i])/sd))
left += [row]
if b != b4 and i>min and len(rows) - i > min:
mad1,mad2 = mads(left),mads(rows[i:])
if now < most
=
b4 = b
for row in sorted(rows, key=lambda row
-- Copy items i..j of list t into a new list (j defaults to #t).
-- `u` is a local declared in the parameter list.
function span(t, i, j,    u)
  u = {}
  j = j or #t
  local k = i
  while k <= j do
    u[1 + #u] = t[k]
    k = k + 1
  end
  return u
end
function cut(rows,col,klass)
x = function(r) return r[col] end
y = function(r) return r[klass] end
lt = function(r1,r2) return x(r1) < x(r2) end
t = {}; for _,r in pairs(rows) if x(r) != "?" then t[1+#t]=r end end
table.sort(t,lt)
most = sd(t,y)
for i=1,#t do
j=#t-i
b = (the.BINS/(1+math.exp(-1.7404*math.exp((mu- x(t[i]))/sd))))//2
left[1 + #left] = rows[i]
if b ~= b4 and i>min and j > min:
sd1,sd2 = sd(span(rows,1,i),y), sd(span(rows,i+1))) / (i+j)
now = (i * sd1 + j * sd2) / (i+j)
if now < least then
out={now, {"<=",col,x(rows[i])}, {">",col,x(rows[i+1])}
-- Sample standard deviation of fn(item) over the items of list `row`,
-- computed incrementally. Items for which fn returns "?" are skipped.
-- Returns 0 when fewer than two values were seen.
-- v, d, n, mu, m2 are locals declared in the parameter list.
function sd(row, fn,    v, d, n, mu, m2)
  n, mu, m2 = 0, 0, 0
  -- The loop used to read undefined names `t` and `x`; the list is `row`
  -- and the current value is `v`.
  for i = 1, #row do
    v = fn(row[i])
    if v ~= "?" then
      n, d = n + 1, v - mu
      mu = mu + d / n
      m2 = m2 + d * (v - mu)
    end
  end
  return n < 2 and 0 or (m2 / (n - 1))^.5
end
import fileinput
import random
import re
import sys
from bisect import bisect_left, insort
from math import exp, sqrt
class o(dict):
    """A dict whose keys can also be read and written as attributes."""
    __getattr__ = dict.__getitem__
    __setattr__ = dict.__setitem__
the = o(BINS=10,LEAF=20,FILE=None,HEIGHT=3)
# misc details
def is_num(s):
    """True when the column name starts with an upper-case letter."""
    first = s[0]
    return first.isupper()
def is_x(s):
    """True unless the column name ends in one of '+', '-', '!' or 'X'."""
    last = s[-1]
    return last not in "+-!X"
def is_goal(s):
    """Return 1 for names ending in '+', 0 for names ending in '-', else None."""
    last = s[-1]
    if last == "+":
        return 1
    if last == "-":
        return 0
    return None
def selects(split, row):
    """Report whether `row` satisfies `split`.

    The cell `row[split.col]` is compared with `split.val` using the
    operator named by `split.op` (one of "<", "<=", "==", ">=", ">").
    A missing cell ("?") always passes. An unrecognised operator yields None.
    """
    z = row[split.col]
    if z == "?":
        return True
    val = split.val
    tests = {
        "<":  lambda: z < val,
        "<=": lambda: z <= val,
        "==": lambda: z == val,
        ">=": lambda: z >= val,
        ">":  lambda: z > val,
    }
    test = tests.get(split.op)
    return test() if test else None
## Types are UPPERCASE(). Each one just packs its arguments into an `o`.
def SPLIT(op, col, val):
    """A test on one column: operator `op`, column index `col`, value `val`."""
    return o({"op": op, "col": col, "val": val})


def DATA(rows, y, names):
    """A table: its `rows`, a row-scoring function `y`, and column `names`."""
    return o({"rows": rows, "y": y, "names": names})


def TREE(rows, split, mid, kids):
    """A tree node: its `rows`, the `split` that selected them, `mid`, `kids`."""
    return o({"rows": rows, "split": split, "mid": mid, "kids": kids})


def STATS(n, mid, div):
    """A summary: count `n`, central value `mid`, spread `div`."""
    return o({"n": n, "mid": mid, "div": div})


# Constructors have Leadingcapitals() and return TYPES
Split = SPLIT  # simple constructors are types
def Stats(rows, i):
    """Summarize column `i` of `rows` as a STATS, ignoring "?" cells."""
    known = [row[i] for row in rows if row[i] != "?"]
    known.sort()
    n, mid, div = _stats(known)
    return STATS(n, mid, div)
def _stats(s):
n = len(s)
if n>= 4: mid,div = s[n//2], s[3*n//4] - s[n//4]
else: mid,div = (s[-1] - s[0])/2, s[-1] - s[0]
return n, mid, div / 1.349
def Data(things):
    """Build a DATA from an iterable whose first item is the header row.

    The header fixes the column names. Every later item is stored as a row
    and, for numeric columns, widens that column's observed lo/hi range.
    The returned DATA's `y` is a function of one row: the Euclidean
    distance between the row's normalized goal cells and each goal's target
    (1 for names ending "+", 0 for names ending "-").

    NOTE(review): `y` assumes every goal column is numeric and that goal
    cells are never "?"; confirm against the data files in use.
    """
    names, ys, rows, lo, hi = None, {}, [], {}, {}
    for i, row in enumerate(things):
        if i == 0:
            names = row
            for j, name in enumerate(names):
                w = is_goal(name)
                if w is not None:      # keep weight 0 (minimize) goals too
                    ys[j] = w
        else:
            rows.append(row)
            for j, name in enumerate(names):
                if is_num(name) and row[j] != "?":
                    lo[j] = min(lo.get(j, 1e32), row[j])
                    hi[j] = max(hi.get(j, -1e32), row[j])

    def norm(i, n):
        # the tiny constant avoids dividing by zero when lo == hi
        return (n - lo[i]) / (hi[i] - lo[i] + 1e-32)

    def Y(row):
        return sqrt(sum((norm(i, row[i]) - w) ** 2 for i, w in ys.items()))

    return DATA(rows, Y, names)
def Tree(data, split=None, lvl=0):
    """Build a TREE node over `data.rows` and, recursively, its children.

    `split` is the test that selected these rows (None at the root) and
    `lvl` is the current depth. The node's `mid` is the central value of
    `data.y(row)` over the rows. `data.y` is a function, so the scores are
    computed here rather than passed to `Stats` as a column index.
    """
    scores = sorted(data.y(row) for row in data.rows)
    mid = _stats(scores)[1]
    kids = list(_kids(data, _splits(data), lvl))
    return TREE(data.rows, split, mid, kids)
#-----------------------------------------------------------------------------
def _kids(data, splts, lvl):
    """Yield one sub-tree per split in `splts`, while depth allows.

    A split is only followed when it selects at least `the.LEAF` rows and
    strictly fewer rows than the parent holds. Children reuse the parent's
    `y` function and column names.
    """
    if splts and lvl < the.HEIGHT:
        for split in splts:
            subset = [row for row in data.rows if selects(split, row)]
            if the.LEAF <= len(subset) < len(data.rows):
                # DATA (the plain record), not Data (which parses a header)
                yield Tree(DATA(subset, data.y, data.names), split, lvl + 1)
def _splits(data):
    """Return the splits of the x column with the lowest expected spread.

    For each column accepted by `is_x`, pairs every known cell with its
    row's `data.y` score, asks the numeric or symbolic splitter for
    candidate splits, and keeps the candidate set with the smallest score.
    Returns [] when no column offers a split.
    """
    least, out = 1e32, []
    for col, name in enumerate(data.names):
        if is_x(name):
            xys = sorted((r[col], data.y(r)) for r in data.rows if r[col] != "?")
            if xys:
                splitter = _numSplits if is_num(name) else _symSplits
                tmp, splts = splitter(xys, col)
                if splts and tmp < least:
                    least, out = tmp, splts
    return out
def _symSplits(xys, col):
    """Score splitting a symbolic column on each of its distinct values.

    `xys` is a list of (x, y) pairs. Returns `(score, splits)` where the
    score is the size-weighted sum of the y spread within each x value and
    `splits` holds one "==" Split per distinct x.
    """
    d = {}
    for x, y in xys:
        d.setdefault(x, []).append(y)
    n = len(xys)
    score = sum(len(a) / n * _stats(sorted(a))[-1] for a in d.values())
    return score, [Split("==", col, v) for v in d]
# TODO(perf): the insort/bisect bookkeeping might be avoidable, e.g. collect
# the ys once and only re-sort when x changes. Untested idea.
def _numSplits(xys, col):
    """Find the best binary cut of a numeric column.

    `xys` is a non-empty list of (x, y) pairs sorted by x. Each distinct x
    (after the first) is tried as a cut point: rows with smaller x go low,
    the rest go high. Returns `(score, splits)`, where the score is the
    size-weighted spread of y on the two sides and `splits` is the
    ["<", ">="] pair at the best cut (or [] if x never changes).
    """
    n, b4, best, cuts = len(xys), xys[0][0], 1e32, []
    lo, hi = [], sorted(y for _, y in xys)
    for i, (x, y) in enumerate(xys):
        if x != b4:
            # Score before moving this row: `lo` holds exactly the i rows
            # with smaller x, matching the "<" split and the i / n-i weights.
            xpect = (i * _stats(lo)[-1] + (n - i) * _stats(hi)[-1]) / n
            if xpect < best:
                best, cuts = xpect, [Split("<", col, x), Split(">=", col, x)]
        insort(lo, hi.pop(bisect_left(hi, y)))  # move y hi ==> lo
        b4 = x
    return best, cuts
def show(tree, lvl=0):
    """Print `tree` one node per line, indenting children by depth `lvl`.

    Each line shows the node's split (if any), its `mid`, and its row count.
    """
    cond = ""
    if tree.split:
        s = tree.split
        cond = f"{s.col} {s.op} {s.val} "
    indent = "|.. " * lvl
    print(f"{indent}{cond} (w= {tree.mid}, n={len(tree.rows)})")
    for kid in tree.kids:
        show(kid, lvl + 1)
def leaf(tree, row):
    """Walk down from `tree`, following the first child whose split selects
    `row`, and return the node where no child matches."""
    node = tree
    while True:
        match = next((k for k in node.kids if selects(k.split, row)), None)
        if match is None:
            return node
        node = match
#------------------------------------------------------------------------------
def coerce(x):
    """Return `x` as an int if possible, else a float, else unchanged."""
    for f in (int, float):
        try:
            return f(x)
        except (ValueError, TypeError):
            # not parseable by this converter; try the next one
            pass
    return x
def csv(path):
    """Yield each non-empty line of `path` as a list of coerced cells.

    Whitespace and anything from "#" onward are removed from every line
    first; lines left empty are skipped. A falsy `path` hands `fileinput`
    an empty file list.
    """
    with fileinput.input(files=path or (), encoding="utf-8") as src:
        for line in src:
            line = re.sub(r"\s+|#.*", "", line)
            if line:
                yield [coerce(cell) for cell in line.split(",")]
def _discretize(col, rows, i):
    """Replace each number in `col` by the smallest value seen in its bin.

    `col` holds the cells of column `i` of `rows`. A value's bin is a
    logistic squash of its distance from the column's `mid`, scaled by the
    column's `div`, then multiplied by `the.BINS` and truncated.
    "?" cells are passed through unchanged.
    """
    s = Stats(rows, i)

    def bin_of(z):
        # the tiny constant guards against div == 0; the clamp keeps exp()
        # from overflowing when that makes the exponent huge
        t = -1.7 * (z - s.mid) / (s.div + 1e-32)
        t = max(-700, min(700, t))
        return int(the.BINS / (1 + exp(t)))

    lo = {}
    for n in col:
        if n != "?":
            b = bin_of(n)
            lo[b] = min(lo.get(b, 1e32), n)
    return [lo[bin_of(n)] if n != "?" else "?" for n in col]
def discretized(data):
    """Return a DATA like `data` but with every numeric column discretized.

    Rows are transposed into columns, numeric columns are passed through
    `_discretize`, and the columns are transposed back into row tuples.
    """
    columns = zip(*data.rows)
    new = [
        _discretize(col, data.rows, i) if is_num(name) else col
        for i, (name, col) in enumerate(zip(data.names, columns))
    ]
    return DATA(list(zip(*new)), data.y, data.names)
#--------------------------------------------------------------------
if __name__ == "__main__":
    # Optional first command-line argument: path of the csv file to load.
    if len(sys.argv) > 1:
        the.FILE = sys.argv[1]
    data = Data(csv(the.FILE))  # `Data` is the loader defined in this file
    disc = discretized(data)
    # Show the first ten discretized rows.
    for row in disc.rows[:10]:
        print(row)
.\" Manpage for two.lua
.\" Contact [email protected]
.TH TWO.LUA 1 "November 29, 2025" "1.0" "User Commands"
.SH NAME
two.lua \- stochastic incremental Explainable AI (XAI) tool
.SH SYNOPSIS
.B ./two.lua
.RI [ OPTIONS ]
.RI [ COMMANDS ]
.SH DESCRIPTION
.B two.lua
is a Lua script that implements a stochastic incremental Explainable AI (XAI) algorithm. It processes data from CSV files to cluster data into "best" and "rest" sets, calculates distances between rows using weighted Euclidean distance (for attributes) or normalized distance to goals (for class variables), and incrementally updates a model.
.PP
The script handles both numeric and symbolic data types, supports normalization, and includes a test suite for demonstrating various internal statistical functions.
.SH OPTIONS
These options configure the hyperparameters and inputs for the algorithm. They must be provided as key-value pairs or flags.
.TP
.B \-h
Show the help message and exit.
.TP
.BI \-b " bins"
Set the number of bins for discretization.
.IP
Default: 7
.TP
.BI \-e " era"
Set the frequency (number of rows) at which the model updates.
.IP
Default: 30
.TP
.BI \-f " file"
Specify the path to the input CSV file. The file should contain a header row. Columns starting with uppercase letters are treated as Numerics, others as Symbols. Columns ending in '+' or '-' are treated as optimization goals (maximize/minimize). Columns ending in 'X' are ignored.
.IP
Default: ../lua6/auto93.csv
.TP
.BI \-r " ruleMax"
Set the maximum number of conditions allowed in a rule.
.IP
Default: 3
.TP
.BI \-s " seed"
Set the random number generator seed for reproducibility.
.IP
Default: 42
.SH COMMANDS
The script includes several built-in test hooks and execution modes. These arguments trigger specific functions within the script.
.TP
.B --csv
Read the input file defined by \fB-f\fR and print every parsed row to standard output.
.TP
.B --cut
Demonstrate the list splitting (cutting) function.
.TP
.B --data
Load the data file and display summary statistics for the independent variables (X columns).
.TP
.B --distx
Calculate and display distance statistics between rows based on independent variables (X).
.TP
.B --disty
Calculate and display distance statistics based on dependent optimization goals (Y).
.TP
.B --inc
Demonstrate incremental data updates (adding and removing rows from the statistical model).
.TP
.B --mode
Demonstrate the mode (most frequent item) calculation.
.TP
.B --num
Demonstrate Gaussian random number generation and summary statistics (mean, standard deviation).
.TP
.B --shuffle
Demonstrate the array shuffling function.
.TP
.B --the
Print the current configuration settings (the `the` table).
.TP
.B --two
Run the main stochastic incremental XAI clustering algorithm. This splits the data into training/testing sets, performs clustering, and outputs model performance metrics.
.SH EXAMPLES
.B Run the main XAI algorithm with default settings:
.PP
.nf
./two.lua --two
.fi
.PP
.B Process a custom dataset with a specific random seed:
.PP
.nf
./two.lua -f data/weather.csv -s 101 --two
.fi
.PP
.B Inspect the columns of a dataset:
.PP
.nf
./two.lua -f ../lua6/auto93.csv --data
.fi
.SH EXIT STATUS
Returns 0 on successful execution.
.SH AUTHOR
Tim Menzies ([email protected])
.SH LICENSE
MIT License (mit-license.org)
.SH COPYRIGHT
Copyright (c) 2025 Tim Menzies.
#!/usr/bin/env lua
-- __
-- /\ \__
-- \ \ ,_\ __ __ __ ___
-- \ \ \/ /\ \/\ \/\ \ / __`\
-- \ \ \_ \ \ \_/ \_/ \/\ \L\ \
-- \ \__\ \ \___x___/'\ \____/
-- \/__/ \/__//__/ \/___/
local help = [[
two.lua : stochastic incremental XAI
(c) 2025, Tim Menzies, [email protected], mit-license.org
Options:
-h Show help.
-b bins=7 Number of bins for discretization.
-e era=30 Update model every `era` number of rows.
-r ruleMax=3 Max conditions in a rule.
-s seed=42 Random number seed.
-f file=../lua6/auto93.csv ]]
-- coerce(s) --> v ;; Return a number if `s` reads as one, else `s` with
-- leading and trailing whitespace removed. Returns nothing for nil/false.
local function coerce(s)
  if not s then return end
  local n = tonumber(s)
  if n then return n end
  return (s:match'^%s*(.-)%s*$') end
-- the : settings table; one entry per `key=value` token found in `help`.
local the={}; for k,v in help:gmatch("(%S+)=(%S+)") do the[k] = coerce(v) end
-- Seed the random number generator from the `seed` setting.
math.randomseed(the.seed)
-- Forward declarations; the `function NAME(...)` definitions below fill these.
local DATA, NUM, SYM, COLS, clone, adds
-- | o |_
-- | | |_)
-- Short local names for standard library functions.
local abs,exp,sqrt,log = math.abs, math.exp, math.sqrt, math.log
local max,rand,cos = math.max, math.random, math.cos
local say=io.write
local fmt = string.format
-- sort(t,f) --> t ;; Sort `t` in place (ordered by `f`, if given); return `t`.
local function sort(t,f)
  table.sort(t,f)
  return t end
-- lt(f) --> f ;; Return a comparator: true when `f(a)` is less than `f(b)`.
local function lt(f)
  return function(a,b)
    local fa, fb = f(a), f(b)
    return fa < fb end end
-- cat(a) --> s ;; Return the items of array `a`, space separated, in braces.
local function cat(a)
  local inner = table.concat(a," ")
  return "{" .. inner .. "}" end
-- o(v) --> s ;; Return a string representation of `v`.
-- Whole numbers print without decimals, other numbers with three. Tables
-- with a non-empty array part print their array items in order; all other
-- tables print as sorted ":key value" pairs. Anything else uses `tostring`.
local function o(v,     kind,out)
  kind = type(v)
  if kind == "number" then
    return fmt(v%1==0 and "%.0f" or "%.3f", v) end
  if kind ~= "table" then return tostring(v) end
  out = {}
  if #v > 0 then
    for _,x in ipairs(v) do out[1+#out] = o(x) end
    return cat(out) end
  for k,x in pairs(v) do out[1+#out] = fmt(":%s %s", k, o(x)) end
  return cat(sort(out)) end
-- s2a(s) --> a ;; Split string `s` on "," and return the coerced cells.
-- Only non-empty runs between commas are kept.
local function s2a(s,    cells)
  cells = {}
  for cell in s:gmatch"([^,]+)" do
    cells[#cells + 1] = coerce(cell) end
  return cells end
-- csv(file) --> f ;; Iterator that returns rows from `file`.
-- Raises an error if `file` cannot be opened. The file is closed once the
-- last line has been read.
local function csv(file,    src)
  src = assert(io.open(file))
  return function(    line)
    line = src:read()
    if not line then src:close(); return end
    return s2a(line) end end
-- shuffle(t) --> t ;; Randomly reorder `t` in place and return it.
-- Walks from the last slot down to the second, swapping each slot with a
-- randomly chosen slot at or before it.
local function shuffle(t,    pick)
  local at = #t
  while at >= 2 do
    pick = math.random(at)
    t[at],t[pick] = t[pick],t[at]
    at = at - 1 end
  return t end
-- cut(a0,n,data) --> t,t ;; Split array `a0` after its first `n` items.
-- Without `data`, returns the two plain arrays. With `data`, returns two
-- clones of `data`, one holding each part.
local function cut(a0,n, data)
  local a1,a2 = {},{}
  for j,v in ipairs(a0) do if j <= n then a1[1+#a1]=v else a2[1+#a2]=v end end
  -- An explicit branch is needed here: in a `return` list each comma starts
  -- a new expression, so an `and ... or` chain cannot choose between pairs.
  if data then return clone(data,a1), clone(data,a2) end
  return a1,a2 end
-- mode(d) --> v ;; Return the key of `d` holding the largest count
-- (nil if no count is above zero).
local function mode(d,    top,most)
  most = 0
  for key,count in pairs(d) do
    if count > most then top,most = key,count end end
  return top end
-- box_muller(mu,sd) --> n ;; Return a random number from a Gaussian `mu`,`sd`.
-- Declared local so that it does not leak into the global namespace.
local function box_muller(mu,sd)
  -- math.random() can return 0; 1 - random() lies in (0,1], keeping log() finite.
  local u1 = 1 - math.random()
  local u2 = math.random()
  return mu + sd * math.sqrt(-2 * math.log(u1)) * math.cos(2 * math.pi * u2) end
-- _ | _. _ _ _ _
-- (_ | (_| _> _> (/_ _>
-- DATA(src) --> DATA ;; Create a new, empty DATA and load `src` into it.
-- `src` is handed to `adds`, so it may be a file name or a table of rows.
function DATA(src,    it)
  it = {n=0, rows={}}
  return adds(src, it) end
-- clone(i,src) --> DATA ;; Return a new DATA with the same column names
-- as `i`, then load it with the rows in `src` (if any).
function clone(i, src,    fresh)
  fresh = DATA{i.cols.names}
  return adds(src, fresh) end
-- NUM(at,s) --> NUM ;; Create a NUM object to summarize numbers.
-- `at` is the column position (default 0) and `s` the column name.
-- `best` is 1 when the name ends in "+", otherwise 0.
function NUM(at,s,    goal)
  goal = (tostring(s) or ""):find"+$" and 1 or 0
  return {at=at or 0, of=s, n=0, mu=0, m2=0, sd=0, best=goal} end
-- SYM(at,s) --> SYM ;; Create a SYM object to summarize symbols.
-- `at` is the column position, `s` the column name; `has` holds the counts.
function SYM(at,s,    i)
  i = {n=0, has={}}
  i.at, i.of = at, s
  return i end
-- COLS(row) --> COLS ;; Create a COLS object from a list of column names.
-- Names starting with an uppercase letter become NUMs, the rest SYMs.
-- Every column goes into `all`. Columns whose name does not end in "X" are
-- also filed under `y` (name ends in "+" or "-") or `x` (all others).
function COLS(row,    x,y,all,col)
  x,y,all = {},{},{}
  for n,s in ipairs(row) do
    col = s:match"^[A-Z]" and NUM(n,s) or SYM(n,s)
    all[n] = col
    if not s:match"X$" then
      if s:find"[+-]$" then y[1+#y] = col else x[1+#x] = col end end end
  return {all=all, x=x, y=y, names=row} end
-- ._ _ _ _|_ |_ _ _| _
-- | | | (/_ |_ | | (_) (_| _>
-- add(i,v,inc) --> v ;; Update `i` with `v` (incrementing by `inc`).
-- `i` may be a SYM (has `has`), a NUM (has `mu`) or a DATA (has `rows`).
-- `inc` defaults to 1; a negative `inc` removes `v`. "?" values are ignored.
-- Always returns `v`.
local function add(i,v, inc)
if v == "?" then return v end
inc = inc or 1
-- NOTE(review): the count is bumped before the DATA header check below, so
-- the header row is included in a DATA's `n`.
i.n = i.n + inc
-- SYM: adjust the count for this symbol.
if i.has then i.has[v] = inc + (i.has[v] or 0)
-- NUM: incremental mean / sum-of-squares update; `inc` of -1 reverses it.
elseif i.mu then
-- Removing down to fewer than two items: reset the whole summary.
if inc < 0 and i.n < 2 then i.sd, i.m2, i.mu, i.n = 0,0,0,0 else
local d = v - i.mu
i.mu = i.mu + inc * d / i.n
i.m2 = i.m2 + inc * d * (v - i.mu)
-- max(0, ...) keeps the square root argument from going negative.
i.sd = i.n<2 and 0 or sqrt((max(0,i.m2)/(i.n - 1))) end
-- DATA: the first row seen defines the columns; later rows update them.
elseif i.rows then
if not i.cols then i.cols = COLS(v) else
i._mid = nil -- drop the cached midpoint (see `mid`)
for _,col in pairs(i.cols.all) do add(col, v[col.at], inc) end
-- Rows are only stored when adding; on removal only column stats change.
if inc > 0 then i.rows[1 + #i.rows] = v end end end
return v end
-- sub(i,v) --> v ;; Remove `v` from `i` (an `add` with an increment of -1).
local function sub(i,v)
  local inc = -1
  return add(i,v,inc) end
-- adds(src,it) --> it ;; Update `it` with all items from `src`.
-- `it` defaults to a new NUM. A string `src` is read as a csv file name;
-- otherwise `src` is treated as a table of items (nil means no items).
function adds(src, it)
  it = it or NUM()
  if type(src) == "string" then
    for row in csv(src) do add(it,row) end
    return it end
  for _,row in pairs(src or {}) do add(it,row) end
  return it end
-- mid(i) --> v|row ;; Return central tendency of `i`.
-- NUM: the mean. SYM: the most frequent symbol. DATA: a row holding the
-- `mid` of every column, cached in `i._mid` until the DATA next changes.
local function mid(i)
  if i.mu then return i.mu
  elseif i.has then return mode(i.has)
  elseif i.rows then
    if not i._mid then
      local t={}
      -- Store each value at its column position, so a column with no
      -- midpoint (nil) cannot shift later columns out of place.
      for _,col in pairs(i.cols.all) do t[col.at] = mid(col) end
      i._mid = t end
    return i._mid end end
-- norm(i,v) --> n ;; Normalize `v` 0..1 using `i`.
-- Symbols (when `i` is a SYM) and "?" are returned unchanged. Numbers are
-- squashed with a logistic curve centred on `i.mu` and scaled by `i.sd`.
local function norm(i,v)
  if (i.has or v=="?") and v then return v end
  local z = -1.7 * (v - i.mu)/(i.sd + 1e-32)
  return 1/(1 + math.exp(z)) end
-- aha(col,v1,v2) --> n ;; Return distance between `v1` and `v2`.
-- Two missing values are distance 1. Symbols are 0 if equal, else 1.
-- Numbers are normalized first; a single missing number is replaced by
-- 0 or 1, whichever lies further from the known value.
local function aha(col,v1,v2,    a,b)
  if v1=="?" and v2=="?" then return 1 end
  if col.has then
    if v1==v2 then return 0 else return 1 end end
  a,b = norm(col,v1), norm(col,v2)
  if a == "?" then a = b > 0.5 and 0 or 1 end
  if b == "?" then b = a > 0.5 and 0 or 1 end
  return abs(a - b) end
-- distx(i,row1,row2) --> n ;; Return distance `row1` to `row2` (using X cols).
-- Root of the mean squared per-column distance over `i.cols.x`.
local function distx(i,row1,row2,    d,gap)
  d = 0
  for _,x in pairs(i.cols.x) do
    gap = aha(x, row1[x.at], row2[x.at])
    d = d + gap^2 end
  return sqrt(d/#i.cols.x) end
-- disty(i,row) --> n ;; Return distance of `row` to best goal (using Y cols).
-- Root of the mean squared gap between each normalized goal cell and that
-- goal's `best` value.
local function disty(i,row,    d,gap)
  d = 0
  for _,y in pairs(i.cols.y) do
    gap = norm(y, row[y.at]) - y.best
    d = d + gap^2 end
  return sqrt(d/#i.cols.y) end
-- distys(i,rows) --> rows ;; Sort `rows` (default: `i.rows`) in place by
-- ascending `disty`, and return them.
local function distys(i, rows,    y)
  y = function(row) return disty(i, row) end
  return sort(rows or i.rows, lt(y)) end
-- _|_ |_ o ._ |
-- |_ | | | | | |<
-- two(data) --> t ;; Incrementally cluster `data` into `best` and `rest`.
-- Returns a table with the `best`, `rest` and `seen` DATAs, the held-out
-- `test` rows, and `model`, a comparator that orders rows by how much closer
-- they are to `best` than to `rest`. Shuffles `data.rows` in place.
local function two(data)
local train,test,start,todo,seen,best,rest,d
shuffle(data.rows)
-- Split the shuffled rows at position data.n//2 into train and test.
train,test = cut(data.rows, data.n//2)
-- The first 4 training rows seed the model; the others arrive one at a time.
start,todo = cut(train, 4)
seen = clone(data, start)
-- Of the seed rows, the 2 nearest the goals start `best`; the others, `rest`.
best,rest = cut(distys(seen),2,data)
-- d: X-distance from `row` to the midpoint of `what`, using `seen`'s columns.
d = function(row,what) return distx(seen, row, mid(what)) end
for n,row in pairs(todo) do
-- Stop once index 256 has been passed.
if n>256 then break end; --say(".")
-- Only rows nearer to `best` than to `rest` are added to the model.
if d(row,best) < d(row,rest) then
add(seen, add(best, row)) ; --say(best.n)
-- Keep `best` small: once it outgrows sqrt(seen.n), move the row of `best`
-- that is furthest from the goals (last after sorting) over to `rest`.
if best.n > sqrt(seen.n) then -- print("-")
add(rest, sub(best, table.remove( distys(best)))) end end end
-- Leave `best.rows` sorted by distance to the goals.
distys(best)
return {best=best, rest=rest, seen=seen, test=test,
model = lt(function(row) return d(row,best) - d(row,rest) end)} end
-- _| _ ._ _ _ _
-- (_| (/_ | | | (_) _>
-- egs : command-line actions, keyed by the flag that triggers them.
local egs={}

-- Print the help text.
egs["-h"] = function(_)
  print("\n"..help.."\n") end

-- Reseed the random number generator and record the seed.
egs["-s"] = function(n)
  math.randomseed(n)
  the.seed = n end

-- Print the current settings.
egs["--the"] = function(_)
  print(o(the)) end

-- Print every row parsed from the input file.
egs["--csv"] = function(_)
  for row in csv(the.file) do
    print(o(row)) end end

-- Shuffle a small list and print it.
egs["--shuffle"] = function(_)
  local t = shuffle{10,20,30,40,50}
  print(o(t)) end

-- Print the most frequent key of a small table of counts.
egs["--mode"] = function(_)
  print(mode{d=2,f=10,g=1}) end

-- Split a small list after its second item, print both parts, then repeat
-- the split 100 more times.
egs["--cut"] = function(_,    b,c)
  b,c = cut({10,20,30,40,50},2)
  print(o(b),o(c))
  for _ = 1,100 do
    b,c = cut({10,20,30,40,50},2) end end

-- Summarize 1000 Gaussian samples (mean 10, sd 5) and print mean and sd.
egs["--num"] = function(_,    num)
  num = NUM()
  for _ = 1,1000 do
    add(num, box_muller(10,5)) end
  print(fmt("%.3f %.3f", num.mu, num.sd)) end

-- Load the input file and print each X column summary.
egs["--data"] = function(_,    data)
  data = DATA(the.file)
  for n,col in pairs(data.cols.x) do
    print(n,o(col)) end end
-- Load the input file, shuffle its rows, and print the sorted X-distances
-- between each pair of neighbouring rows.
-- (`rows` is now declared in the parameter list so that it stays local.)
egs["--distx"]= function(_,    data,t,rows)
  data = DATA(the.file)
  t,rows = {}, shuffle(data.rows)
  for n = 2,#rows do t[1+#t] = distx(data,rows[n-1],rows[n]) end
  print(o(sort(t))) end
-- Load the input file, sort its rows by distance to the goals, and print
-- those distances in row order.
-- (`t` is now declared in the parameter list so that it stays local.)
egs["--disty"]= function(_,    data,t)
  data,t = DATA(the.file), {}
  distys(data)
  for n,row in pairs(data.rows) do t[n]=disty(data,row) end
  print(o(t)) end
-- Show that adding and removing rows are inverse operations: print the
-- midpoint of the whole file, then grow a clone row by row (printing its
-- midpoint when its count reaches 50), then shrink it again (printing the
-- midpoint, and stopping, when the count is back at 50).
egs["--inc"] = function(_,    data1,data2)
  data1 = DATA(the.file)
  print(o(mid(data1)))
  data2 = clone(data1)
  for _,row in pairs(data1.rows) do
    add(data2,row)
    if data2.n==50 then print(o(mid(data2))) end end
  -- Stop when the rows run out: the table itself is always truthy, and
  -- removing from an empty list would pass nil on to `sub`.
  while #data2.rows > 0 do
    sub(data2, table.remove(data2.rows))
    if data2.n==50 then print(o(mid(data2))); break end end end
-- Run `two` 20 times on the input file. Each time, sort the test rows with
-- the learned model, score the top row by its distance to the goals (as a
-- whole-number percentage), then print all 20 scores in sorted order.
egs["--two"] = function(_,    data,out,t,top)
  t = {}
  data = DATA(the.file)
  for _ = 1,20 do
    out = two(data)
    top = sort(out.test, out.model)[1]
    t[1+#t] = (100*disty(out.seen, top))//1 end
  print(o(sort(t))) end
-- cli(d,funs) --> nil ;; Update `d` with flags from command-line; run `funs`.
-- Arguments are handled left to right. An argument naming an entry of
-- `funs` runs that entry, passing it the next argument (coerced). A
-- one-letter flag such as `-b` sets every key of `d` that starts with that
-- letter to the next argument (coerced); a flag with no following value is
-- ignored.
local function cli(d,funs)
  for i,s in ipairs(arg) do   -- ipairs: in order, skipping arg[0] and below
    if funs[s] then
      funs[s](coerce(arg[i+1]))
    else
      local flag = s:match"^%-(.)$"   -- only "-x" counts as a setting flag
      local v = coerce(arg[i+1])
      if flag and v ~= nil then
        for k,_ in pairs(d) do
          if k:sub(1,1)==flag then d[k]=v end end end end end end

if arg[0]:find"two.lua" then cli(the,egs) end
.\" Manpage for two.lua data format
.\" Contact [email protected]
.TH TWO_DATA 5 "November 29, 2025" "1.0" "File Formats"
.SH NAME
two_data \- input CSV format for the two.lua XAI tool
.SH DESCRIPTION
The \fBtwo.lua\fR script processes data stored in comma-separated value (CSV) files. The format relies heavily on a structured header row to define variable types, optimization goals, and ignored columns.
.PP
The file must be plain text, with rows separated by newlines and columns separated by commas.
.SH HEADER SYNTAX
The first row of the file is crucial. It defines the schema. The naming convention of the column headers tells the script how to treat the data in that column.
.SS Data Types
.TP
.B Uppercase Start (e.g., \fIAge\fR, \fISalary\fR)
Columns where the header name starts with an uppercase letter are treated as \fBNUM\fR (Numeric). The script will calculate mean and standard deviation for these.
.TP
.B Lowercase Start (e.g., \fIjob\fR, \fIrace\fR)
Columns where the header name starts with a lowercase letter are treated as \fBSYM\fR (Symbolic). The script will count how often each symbol occurs and report the mode (most frequent symbol) for these.
.SS Column Roles
.TP
.B Suffix '+' (e.g., \fIClass+\fR, \fIAcc+\fR)
Indicates a dependent variable (target) that should be \fBMAXIMIZED\fR. These columns constitute the "Y" (goal) variables.
.TP
.B Suffix '-' (e.g., \fILbs-\fR, \fICost-\fR)
Indicates a dependent variable (target) that should be \fBMINIMIZED\fR. These columns constitute the "Y" (goal) variables.
.TP
.B Suffix 'X' (e.g., \fIidX\fR, \fIDateX\fR)
Indicates a column that should be \fBIGNORED\fR entirely. These are often used for unique identifiers, comments, or raw data not suitable for clustering.
.TP
.B No Suffix
Any column without a '+', '-', or 'X' suffix is treated as an independent variable ("X" variable) used for clustering and distance calculations.
.SH DATA FORMAT
.TP
.B Numeric Values
Standard integer or floating-point numbers.
.TP
.B Symbolic Values
String identifiers. Note that the script splits strictly on commas, so strings containing commas may cause parsing errors.
.TP
.B Missing Values
Missing data should be represented by the question mark character (\fB?\fR). The script contains logic to handle these during distance calculations (assumed max distance).
.SH EXAMPLES
.PP
.B Example 1: A simple optimization dataset
.PP
In this example:
.br
- \fInameX\fR is ignored (ends in X).
.br
- \fIAge\fR is numeric independent (starts Upper).
.br
- \fIjob\fR is symbolic independent (starts lower).
.br
- \fISalary+\fR is a numeric goal to maximize.
.PP
.nf
nameX,Age,job,Salary+
1,25,engineer,50000
2,30,doctor,90000
3,?,artist,40000
.fi
.PP
.B Example 2: Multi-objective car selection
.PP
In this example:
.br
- \fIClndrs\fR, \fIVol\fR, \fIHp\fR are numeric inputs.
.br
- \fIorigin\fR is a symbolic input.
.br
- \fILbs-\fR is a goal to minimize (weight).
.br
- \fIAcc+\fR is a goal to maximize (acceleration).
.PP
.nf
Clndrs,Vol,Hp,origin,Lbs-,Acc+
8,304,193,1,4732,18.5
8,360,215,1,4615,14
4,97,52,3,2130,24.6
.fi
.SH SEE ALSO
.BR two.lua (1)

Zen of Bash

Copyright (c) 2025 Tim Menzies, MIT License
https://opensource.org/licenses/MIT

Bash is a powerful shell for automation and scripting. While famous for command-line one-liners, it's equally adept at building development environments, managing workflows, and creating portable init scripts.

Why Bash? Because it's everywhere. Pre-installed on virtually every Unix-like system, bash is the language that proves ubiquity is power. It works today, it worked 20 years ago, it will work 20 years from now.

Bash has its limits, sure: quirky syntax, easy to make subtle errors, limited data structures. That said, bash is superb for gluing tools together, managing environments, and creating scripts that just work across systems without dependencies.

25 Core Techniques

I. Script Initialization

1. The Shebang

First line declares the interpreter. Makes scripts executable.

#!/usr/bin/env bash

Use env to find bash in PATH rather than hardcoding /bin/bash.

2. Checking Dependencies

Verify required tools exist before running script.

WANT="git nvim gawk tree figlet"
for want in $WANT; do
  command -v "$want" &>/dev/null || echo "Warning: $want is NOT installed."
done

Uses command -v (POSIX-compliant) instead of which.

3. Silencing Warnings

Export variables to control environment behavior.

export BASH_SILENCE_DEPRECATION_WARNING=1

From ell. Prevents macOS from nagging about zsh.

4. Source vs Execute Detection

Script behaves differently when sourced vs executed.

if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
  # Executed directly
  exec bash --init-file "${BASH_SOURCE[0]}" -i
fi

BASH_SOURCE[0] is script path, $0 is invocation name.

II. Working with Paths

5. Getting Script Directory

Reliably find where script lives, regardless of invocation.

Here="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

From ell. Critical for scripts that load resources.

6. Modifying PATH

Add script directory to PATH for running local tools.

export PATH="$Here:$PATH"

Prepend (not append) so local versions override system ones.

III. Colors and Formatting

7. Terminal Colors with tput

Portable color codes using terminfo database.

bold=$(tput bold)
col0=$(tput sgr0)      # reset
col1=$(tput setaf 6)   # cyan
col2=$(tput setaf 3)   # yellow

Prefer tput over ANSI codes for portability.

8. Here Documents

Multi-line strings embedded in script.

hi() { 
  clear
  echo "${col1}"
  cat<<'EOF'
  _____ _      _      
 |  ___| |    | |     
 | |__ | |    | |     
EOF
  echo "${col0}"
}

Quote 'EOF' to prevent variable expansion in heredoc.

IV. Prompt Customization

9. Dynamic Prompt with Functions

Build prompt from function output evaluated on each display.

branch() { git branch 2>/dev/null | awk '/^\*/ {print $2}'; }
dirty() { [[ -n $(git status -s 2>/dev/null) ]] && echo "*"; }
PROMPT_COMMAND='PS1="${bold}${col1}$(basename "$PWD")${col0} ${col2}$(branch)$(dirty)${col0} ▶ "'

PROMPT_COMMAND runs before each prompt display.

10. Showing Parent and Current Directory

Compact path display showing context.

$(basename "$(dirname "$PWD")")/$(basename "$PWD")

Shows parent/current instead of full path.

V. Aliases

11. Basic Aliases

Short names for common commands.

alias Q='exit' 
alias l='ls -lh' 
alias la='ls -la'

Use single quotes to prevent early expansion.

12. Colorized Tool Defaults

Override commands with better defaults.

alias ls="\ls --color"
alias grep='grep --color=auto'

Backslash \ls prevents alias recursion.

13. Reload Function

Source script again to apply changes.

alias reload="source '$Here/ell' && echo ✅"

Double quotes allow $Here expansion at definition time.

VI. History Management

14. History Size

Store more commands for better recall.

export HISTSIZE=10000
export HISTFILESIZE=20000

HISTSIZE is in-memory, HISTFILESIZE is on-disk.

15. Deduplication

Remove duplicate commands from history.

export HISTCONTROL=ignoredups:erasedups

ignoredups = consecutive dups, erasedups = all dups.

16. History Options

Improve multi-line command handling.

shopt -s histappend  # Append, don't overwrite
shopt -s cmdhist     # Multi-line as one entry

shopt -s sets shell options.

VII. Functions

17. Simple Functions

Functions create reusable commands.

mkcd() { 
  mkdir -p "$1" && cd "$1"
}

&& ensures cd only if mkdir succeeds.

18. Wrapping External Tools

Create configured versions of existing commands.

vi() {
  nvim --clean \
    --cmd "set number relativenumber" \
    --cmd "set mouse=a clipboard=unnamedplus" \
    "$@"
}

"$@" passes all arguments to wrapped command.

19. Piping to Functions

Functions can process stdin.

# Plot the data arriving on stdin. Calls gnuplot (not this function again).
plot() { 
  gnuplot -p -e 'plot "-"'
}

The - tells gnuplot to read from stdin.

VIII. Conditional Execution

20. Double Bracket Tests

Modern bash conditionals with better syntax.

if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
  exec bash --init-file "${BASH_SOURCE[0]}" -i
fi

[[ ]] supports ==, &&, ||, no quote requirements.

21. Testing for Non-Empty Strings

Check if command produced output.

[[ -n $(git status -s 2>/dev/null) ]] && echo "*"

-n tests for non-empty string.

IX. Redirection

22. Stderr Redirection

Send errors to /dev/null to suppress warnings.

command -v "$want" &>/dev/null || echo "Warning"

&> redirects both stdout and stderr.

23. Filtering Output

Pipe through grep to show only relevant lines.

# Lint and type-check file "$1", showing only the lines that report problems.
check() {
  ruff check "$1" 2>&1 | grep -v "All checks passed"
  # POSIX classes: grep -E has no \d, and \s is a GNU-only extension.
  pyright "$1" 2>&1 | grep -E "^[[:space:]]+.*:[0-9]+:[0-9]+"
}

2>&1 redirects stderr to stdout for piping.

X. Advanced Patterns

24. Temporary Directories

Create and auto-cleanup temp space.

_TMP=$(mktemp -d)
trap "rm -rf '$_TMP'" EXIT INT TERM

trap ensures cleanup on exit or interrupt.

25. Command Substitution in Prompts

Execute commands in prompt strings.

PS1="${bold}${col1}\$(basename \"\$PWD\")${col0}"

Escape $ with \$ to defer evaluation until prompt display.

Key Principles

  1. Use [[ ]] over [ ] - Modern syntax, fewer gotchas
  2. Quote everything - "$var" not $var (prevents word splitting)
  3. Prefer $() over backticks - Clearer, nestable
  4. Check exit codes - Use command || handle_error
  5. Use functions liberally - Easier to test and reuse
  6. Export what you need - export VAR=value for child processes
  7. Fail fast - Check dependencies at start
  8. Make scripts portable - Test on different systems
  9. Use tput for colors - More portable than ANSI codes
  10. Document with comments - Future you will thank present you

the zen of gawk: 40 core techniques

Copyright (c) 2025 Tim Menzies, MIT License
https://opensource.org/licenses/MIT

image

gawk (GNU AWK) is a powerful tool for text processing and data transformation. While famous for one-liners, it's equally adept at building data pipelines and (as seen here) implementing machine learning classifiers.

Why gawk (GNU AWK)? Because easy is not wrong. Ubiquitous and stable, gawk is the language that proves simplicity is not weakness. gawk has its limits, sure: associative arrays are its only data structure; functions return only strings or numbers; variables are untyped; and it is interpreted only (no compiler). That said, AWK is a superb language for testing ideas, especially where the problem can be broken into chunks which can be streamed as part of a pipe.

Learning Resources

Installation

brew install gawk       # macOS
winget install gawk     # windows
sudo apt install gawk   # Linux (Debian/Ubuntu)
sudo dnf install gawk   # Linux (Red Hat/Fedora)

Quick Command-Line Examples

# Print second column from CSV
gawk -F',' '{print $2}' data.csv

# Sum numbers in first column
gawk '{sum+=$1} END {print sum}' numbers.txt

# Count lines matching pattern
gawk '/error/ {count++} END {print count}' log.txt

# Print lines 10-20
gawk 'NR>=10 && NR<=20' file.txt

40 Core Techniques

1. Shebang Line

#!/usr/bin/env gawk -f

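Makes script executable. Use chmod +x script.awk then run as ./script.awk data.txt. Note: Linux passes everything after the interpreter path as a single argument, so env may look for a program literally named "gawk -f". If that happens, use #!/usr/bin/env -S gawk -f (GNU coreutils 8.30 or later) or give the full path, e.g. #!/usr/bin/gawk -f.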

2. BEGIN Block

BEGIN { 
  FS=","; BINS=7; CONVFMT = "%.2f"
}

Runs once before processing any input. Set up variables, counters, constants. From bins script.

3. END Block

END { 
  report() 
}

Runs once after all input processed. Perfect for final reports, summaries. All three scripts use this.

4. Pattern-Action Blocks

NR==1 { head(); print }         # Runs only on first line
NR>1  { body() }                # Runs on all lines after first
/^[A-Z]/ { print }              # Runs when line starts with uppercase

If pattern matches, action executes. No pattern = always run. No action = print line.

5. Field Separator (FS)

BEGIN { FS="," }                # Split on comma (CSV)
BEGIN { FS=" " }                # Split on space (default)
BEGIN { FS="\t" }               # Split on tab (TSV)

Defines how AWK splits each line into fields. All three scripts use FS=",".

6. Built-in Variables (NF, NR)

NF                              # Number of Fields in current line
NR                              # Number of Records (line number)
for(i=1; i<=NF; i++)           # Loop through all fields
if (NR > WAIT+1)               # Skip first 20 lines (nbc)

7. Field References

$1                              # First field (actual class in nbc)
$i                              # Field at position i
$NF                             # Last field
$0                              # Entire line
print $2, $4                    # Print 2nd and 4th fields

In nbc: $1 == name checks if first field matches input.

8. Associative Arrays

cnt[i] = 5                      # Simple key-value
nk[actual]++                    # Count classes (nbc)
words["hello"] = 42             # String keys

No need to declare. Keys can be strings or numbers.

9. Multi-Dimensional Arrays

cf[kl]["tp"]++                  # 2D: confusion matrix (abcd)
freq[i][$i][actual]++           # 3D: column/value/class (nbc)
bmin[i][b] = v                  # 2D: bin minimums (bins)

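gawk supports true arrays of arrays, written a[i][j] as in the examples above. Classic AWK instead simulates multi-dimensional arrays with a[i,j], which joins the subscripts into a single key using SUBSEP.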

10. String Functions: gsub

gsub(/[ \t\r]/,"")              # Remove whitespace globally
gsub("old", "new", var)         # Replace all in variable
sub("old", "new", var)          # Replace first occurrence only

All scripts use gsub(/[ \t\r]/,"") to clean input.

11. Regular Expressions

if ($i ~ /^[A-Z]/)              # Matches: starts with uppercase (bins)
if ($i !~ /[-+!]$/)             # Not match: doesn't end with -+! (bins)
/^fo+bar$/ { print }            # Pattern: one or more 'o'
$i ~ /[!]$/                     # Find class column (nbc)

12. User-Defined Functions

function seen(i,v) {
  cnt[i] += 1
  return v
}
function div(a,b) { 
  return int(100*a/(b+1E-32)) 
}

From abcd and bins. Define reusable logic.

13. Local Variables (The Hack)

function seen(i,v,    d) {      # d is local (note extra spaces)
  d = v - mu[i]                 # Welford's algorithm in bins
  mu[i] += d/cnt[i]
  return v
}

Everything is global except function parameters. Add extra params after whitespace for locals.

14. The in Operator

if (!(want in cf))              # Check if key exists (abcd)
  cf[want]["tn"] = total
if ("foo" in assoc)             # Test array membership
for (k in nk)                   # Iterate over keys

15. Piping Output

printf "..." | "sort -n"        # Pipe to shell command (abcd)
print data | "sort -r"          # Reverse sort

AWK can pipe output to any shell command.

16. Mathematical Functions

sqrt(m2[i]/(cnt[i]-1))          # Square root - std dev (bins)
log((nk[k] + K) / (NR-1))       # Natural log (nbc)
exp(-1.704 * (v - mu[i]))       # Exponential (bins)
int(BINS / (1 + ...))           # Integer conversion (bins)

17. Printf Formatting

printf "%5d %5d %5d", a, b, c   # Fixed-width integers (abcd)
printf "%.2f", value            # 2 decimal places
CONVFMT = "%.2f"                # Default conversion format (bins)

18. Ternary Operator

sd[i] = cnt[i] < 2 ? 0 : sqrt(...)  # Condition ? true : false (bins)
best = best ? best : k              # Set default value (nbc)

19. Increment Operators

cnt[i] += 1                     # Add and assign (bins)
nk[actual]++                    # Increment by 1 (nbc)
cf[kl]["tp"]+=(got==want)       # Add boolean result (abcd)

20. For Loops

for(i=1; i<=NF; i++)            # C-style: iterate fields
for(k in nk)                    # For-in: iterate keys (nbc)
for(r in row)                   # Iterate rows (bins)

21. Auto-Initialization

count[x]++                      # Uninitialized = 0
sum += value                    # No need for sum=0 first
nk[actual]++                    # Works immediately (nbc)

All variables start at 0 (numeric) or "" (string context).

22. Numeric Context Coercion

v += 0                          # Force string to number (bins)
if (v != "?")                   # String comparison

The += 0 idiom ensures numeric context.

23. Boolean as Integer

cf[kl]["tp"]+=(got==want)       # Boolean → 1 or 0 (abcd)
cf[kl]["fn"]+=(got!=want)       # False = 0, True = 1

Comparison operators return 0 or 1, usable in arithmetic.

24. String Concatenation

s = s sep bin(i, row[r][i])     # No operator needed (bins)
sep = ","                       # Just place strings adjacent
print "Hello" " " "World"       # → "Hello World"

25. Scientific Notation

1E32                            # Large number (infinity proxy)
1E-32                           # Small number (epsilon)
hi[i] = -(lo[i] = 1E32)         # Initialize bounds (bins)

26. Assignment Returns Value

hi[i] = -(lo[i] = 1E32)         # lo[i]=1E32 returns 1E32
if (n = split(...))             # Assign and test

Assignments are expressions that return the assigned value.

27. Guard Values (Division by Zero)

div(a, b+1E-32)                 # Avoid /0 (abcd)
sd[i] + 1E-32                   # Epsilon guard (bins)

28. Array Length

length(freq[i])                 # Number of keys (nbc)
length(nk)                      # Count classes (nbc)

Returns number of elements in associative array.

29. Short-Circuit Evaluation

if ((v!="?") && (i in hi))      # && stops if first is false (bins)
a || b                          # || stops if first is true

30. Dynamic Field Storage

row[NR-1][i] = seen(i,$i)       # Build custom structure (bins)

Store fields in your own arrays for later processing.

31. Action Without Pattern

{ gsub(/[ \t\r]/,"") }          # Applies to ALL lines

Implicit pattern (always true) modifies every line.

32. Multiple Statements Per Line

BEGIN { FS=","; BINS=7; CONVFMT = "%.2f" }

Semicolons separate statements on same line.

33. Implicit String/Number Conversion

"5" + 3                         # → 8 (string becomes number)
x = 5; print x ""               # → "5" (number becomes string)

AWK automatically converts based on context.

34. Delete Array Elements

delete ARGV[1]                  # Remove one element
delete array                    # Delete entire array

Free memory or remove unwanted elements.

35. Special Variables

ARGC                            # Number of command-line args
ARGV[n]                         # Command-line arguments
FILENAME                        # Current input filename
OFS                             # Output field separator

36. getline Variations

getline name < "/dev/stdin"     # Read from stdin (nbc example)
getline var < "file.txt"        # Read from file
"cmd" | getline var             # Read from command
getline                         # Read next line into $0

37. system() Command

system("echo foobar")           # Execute shell command
system("date")                  # Run external programs

Returns exit status of command.

38. Exponentiation

f = count ^ 2                   # Power operator
f ^= 2                          # Compound assignment

39. Case Conversion

tolower("HELLO")                # → "hello"
toupper("hello")                # → "HELLO"

Useful for case-insensitive comparisons.

40. String Functions (substr, match)

substr("foobar", 2, 3)          # → "oob" (start, length)
substr("foobar", 4)             # → "bar" (start to end)
match(str, /regex/)             # → position of match (or 0)
split("a:b:c", arr, ":")        # Split into array

Putting It Together

See bins, nbc, and abcd scripts for real-world examples using these techniques:

# Discretize then classify then evaluate
bins < diabetes.csv | nbc | abcd

Each script demonstrates multiple techniques working together to solve classification problems efficiently.

The zen of make: 20 core techniques

Copyright (c) 2025 Tim Menzies, MIT License
https://opensource.org/licenses/MIT

Introduction

image

make is a powerful tool for automating tasks. While famous for compiling code, it's just as useful for managing data science pipelines, documentation, and (as seen in this Gist) running script examples. It manages dependencies and executes commands, saving you from re-running entire workflows when only one part has changed.

This document uses the makefile from this Gist as its primary source of examples.

Learning Resources

Installation

make is pre-installed on virtually all Linux and macOS systems. You can check by running:

make --version

For Windows, make is available through tools like Windows Subsystem for Linux (WSL), Git Bash, or by installing it with a package manager like Chocolatey (choco install make).

Quick Command-Line Examples

# Run the default goal (in this case, 'help')
make

# Run the "test" recipe to execute all examples
make test

# Run a single, specific task
make eg-abcd

# "Dry run" - show commands without executing them
make -n eg-abcd

20 Core Techniques

I. The Basics: Rules and Recipes

1. The Core Rule: target: dependency and action

The fundamental syntax. make will ensure dependency is up-to-date before running the recipe for target.

tmp/out.log: data.csv
    cat data.csv | classify > tmp/out.log    # recipe that builds the target from its dependency

Note that the first time we run make tmp/out.log, the target is generated. If we run it again, nothing happens unless the dependency (data.csv) has been modified more recently than the target.

2. Recipes

The indented shell commands to run for a target. Note: You must use a Tab, not spaces.

ok:
     chmod +x nbc abcd bins

3. Comments

Lines starting with # are ignored, perfect for documentation.

# vim: ts=2 sw=2 noet

4. The Default Goal

If you just type make, it runs the first target in the file. In this makefile, the first target is help.

help: ## show this help
	...

II. Targets and Dependencies

5. Phony Targets with .PHONY

Declares that a target does not create an actual file. This prevents make from getting confused if a file with the same name (e.g., test) exists.

.PHONY: help egs ok eg-nbc eg-abcd eg-soybean eg-globals pull push

6. Using Targets as "Scripts"

Phony targets act as convenient, memorable names for running complex commands.

test: eg-nbc eg-abcd eg-soybean eg-globals ## run all egs: now

7. Dependency Chains

Targets can depend on other targets, creating a chain of operations. Here, test depends on eg-nbc, which in turn depends on ok. make will automatically run them in the correct order: ok, then eg-nbc, then test.

test: eg-nbc eg-abcd eg-soybean eg-globals ## run all egs: now
eg-nbc: ok ## run naive bayes classifier
ok:; chmod +x nbc abcd bins

III. Variables

8. Simply Expanded Variables :=

The value is computed once at the point of definition and stored.

CYAN := \033[1;36m

9. Recursively Expanded Variables =

The value is a reference that is expanded every time the variable is used.

cdata=~/gits/timm/moot/classify/diabetes.csv

10. Using Variables

Reference variables using $(...) or ${...}.

hi=@echo -e "\n--------- $(YELLOW)$@$(RESET)"

11. Special Variable: SHELL

Specifies which shell to use for running recipes.

SHELL=/bin/bash

12. Special Variable: $(CURDIR)

An internal make variable that holds the path to the current working directory.

export PATH := $(CURDIR):$(PATH)

13. Automatic Variable: $@ (The Target)

Refers to the name of the target being executed. In the example, $@ will be eg-nbc when make eg-nbc is run.

hi=@echo -e "\n--------- $(YELLOW)$@$(RESET)"
eg-nbc: ok ## run naive bayes classifier
	$(hi); cat $(cdata) | bins | nbc | sort -n | uniq -c

14. Special Variable: $(MAKEFILE_LIST)

A list of all makefiles that were read. The help target cleverly uses this to read its own source code.

... 'BEGIN { FS=":.*## "; ... }' $(MAKEFILE_LIST)

IV. Recipes and Shell Commands

15. Silencing Commands with @

Prefixing a command with @ prevents make from printing the command to the console before running it.

@echo -e "\n$(CYAN)$$(cat bird.txt)$(RESET)\n"

16. Global Silencing with .SILENT:

A special target that tells make not to print any commands before running them. This is why you only see the output of the scripts in this Gist.

.SILENT:

17. Recursive make with $(MAKE)

It's common practice to use the $(MAKE) variable to call make from within make. This preserves settings and options.

eg-soybean: ## run classifier on soybean
	$(hi); $(MAKE) cdata=$(soybean) eg-abcd

18. Overriding Variables from CLI

You can change a variable's value from the command line. The eg-soybean target relies on this, calling eg-abcd but overriding the cdata variable for that one run.

# This is what the 'eg-soybean' target effectively runs:
make eg-abcd cdata=~/gits/timm/moot/classify/soybean.csv

V. Best Practices

19. Self-Documenting ## Convention

A common (but not built-in) convention. By putting ## after a target, you create a help comment that a help target can automatically parse and print.

test: eg-nbc eg-abcd eg-soybean eg-globals ## run all egs: now
eg-nbc: ok ## run naive bayes classifier
eg-abcd: ok ## run classifier with confusion matrix

20. The help Target

This gold-standard help target combines gawk, $(MAKEFILE_LIST), and the ## convention to automatically generate a help menu from the comments in the file itself.

CYAN   := \033[1;36m
YELLOW := \033[1;33m
GREEN  := \033[1;32m
RESET  := \033[0m

help: ## show this help
	gawk 'BEGIN { FS=":.*?## ";                                         \
			          printf "\n%smake%s [%soptions%s]:\n\n",                \
						           "$(CYAN)", "$(RESET)", "$(YELLOW)", "$(RESET)"}  \
		    NF==2 && $$1~/^[a-z0-9A-Z_-]+/                                   \
		          { printf "  %s%-15s%s %s\n",                                \
						           "$(GREEN)", $$1, "$(RESET)", $$2}' $(MAKEFILE_LIST)

This code parses a Makefile to generate a help menu:

image
@timm
Copy link
Author

timm commented Oct 31, 2025

Todo

Now

  • move to eras (line1= header, blank line= era++, each era tests on last era before updating)
  • add config to make and strings like G=gawk -v $(CONFIG) -f and OK=$G ok.awk -f
  • todo for zen of awk: add limits (interpreted, no data structures, functions can't return); strangely fast regex and TCO (even Python does not have that)

Later

  • best rest
  • tree generation
  • final check: cat data | era | bins | bestrest | tree # tree test = check=5

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment