Quick link = http://tiny.cc/litr
Src: abcd | bins | ell | makefile | nbc | ok.awk | tree.py
Doc: abcd.md | bins.md | nbc.md | zenOfBash.md | zenOfGawk.md | zenOfMake.md
Quick link = http://tiny.cc/litr
Src: abcd | bins | ell | makefile | nbc | ok.awk | tree.py
Doc: abcd.md | bins.md | nbc.md | zenOfBash.md | zenOfGawk.md | zenOfMake.md
| #!/usr/bin/env gawk -f | |
| # Copyright (c) 2025 Tim Menzies, MIT License | |
| # https://opensource.org/licenses/MIT | |
# Input: one "actual predicted" pair per line, separated by blanks.
BEGIN { FS = " " }

# Tally every pair as it arrives ...
{ seen($1, $2) }

# ... and print the per-class summary once the input is exhausted.
END { report() }
# Update the confusion-matrix counts in cf[class][tn|fn|fp|tp] for one
# (want = actual, got = predicted) pair. A class met for the first time
# starts with every earlier row counted as a true negative.
function seen(want, got,    cls, hit) {
  if (!(want in cf)) cf[want]["tn"] = total
  if (!(got  in cf)) cf[got]["tn"]  = total
  total++
  for (cls in cf) {
    if (cls == want) {
      hit = (got == want)
      cf[cls]["tp"] += hit
      cf[cls]["fn"] += !hit
    } else {
      hit = (got == cls)
      cf[cls]["fp"] += hit
      cf[cls]["tn"] += !hit }}}
# Print one line per class: counts (N, TN, FN, FP, TP) followed by
# recall (PD), precision, false-alarm rate (PF) and accuracy as integer
# percentages. Rows are piped through "sort -n".
function report(    cls, tn, fn, fp, tp, sorter) {
  sorter = "sort -n"
  print " N TN FN FP TP PD PREC PF ACC LABEL"
  for (cls in cf) {
    tn = cf[cls]["tn"]
    fn = cf[cls]["fn"]
    fp = cf[cls]["fp"]
    tp = cf[cls]["tp"]
    printf "%5d %5d %5d %5d %5d %5d %5d %5d %5d %-20s\n",
           tp + fn, tn, fn, fp, tp,
           div(tp, tp + fn),
           div(tp, fp + tp),
           div(fp, fp + tn),
           div(tp + tn, tn + fn + fp + tp),
           cls | sorter }}
# a/b as a percentage truncated to an integer; the tiny epsilon keeps a
# zero denominator from raising a division error.
function div(a, b,    pct) {
  pct = 100 * a / (b + 1E-32)
  return int(pct) }
Copyright (c) 2025 Tim Menzies, MIT License
https://opensource.org/licenses/MIT
chmod +x abcd && export PATH=$PATH:$(pwd) # install
someprogram data.txt | abcd # use
actual predicted
actual predicted
etc
Uses diabetes.csv
make eg-abcd
--------- eg-abcd
N TN FN FP TP PD PREC PF ACC LABEL
255 383 83 110 172 67 60 22 74 tested_positive
493 172 110 83 383 77 82 32 74 tested_negative| #!/usr/bin/env gawk -f | |
| # Copyright (c) 2025 Tim Menzies, MIT License | |
| # https://opensource.org/licenses/MIT | |
# Comma-separated input.
BEGIN   { FS = "," }

# Remove blanks, tabs and carriage returns from every line.
        { gsub(/[ \t\r]/, "") }

# Header: remember the goal columns, then echo it with the new "Y-" column.
NR == 1 { head(); print $0 ",Y-" }

# Data rows are stored; they are printed (sorted) at the end.
NR > 1  { body() }
END     { tail() }
# Scan the header: a column whose name ends in "+" or "-" is a goal.
# y[col] is 1 for "+" (maximize) and 0 for "-" (minimize); lo/hi start
# at +/-1e32 so the first value seen replaces them.
function head(    col, name) {
  for (col = 1; col <= NF; col++) {
    name = $col
    if (name !~ /[\+\-]$/) continue
    lo[col] = 1e32
    hi[col] = -1e32
    y[col]  = (name ~ /\+$/) }}
# Store the current record in row[NR-1]. Goal columns are forced to
# numbers and widen that column's lo/hi range.
function body(    col, val) {
  for (col = 1; col <= NF; col++) {
    if (col in y) {
      $col += 0
      val = $col
      if (val > hi[col]) hi[col] = val
      if (val < lo[col]) lo[col] = val }
    row[NR-1][col] = $col }}
# Rescale x to 0..1 using the lo/hi range seen for column i.
function norm(i, x,    span) {
  span = hi[i] - lo[i] + 1e-32
  return (x - lo[i]) / span }
# Root-mean-square distance of one row's normalized goal values from
# their targets (y[i] is 1 for "+" goals, 0 for "-" goals).
#   row : array of cell values indexed by column number
# The loop variable i is declared local here; previously it was missing
# from the parameter list and leaked into the global namespace.
function dist(row,    d, n, i) {
  for (i in y) {
    d += (norm(i, row[i]) - y[i])^2
    n++ }
  return (d / (n + 1e-32))^.5 }
# Print every stored row with its distance appended, piped through a
# numeric sort on that last (new) column.
function tail(    r, c, line, com) {
  for (r in row) {
    if (!com) com = "sort -t, -nk" (1 + length(row[r]))
    line = ""
    for (c = 1; c <= NF; c++)
      line = line (c > 1 ? "," : "") row[r][c]
    print line "," dist(row[r]) | com }}
| #!/usr/bin/env gawk -f | |
| # Copyright (c) 2025 Tim Menzies, MIT License | |
| # https://opensource.org/licenses/MIT | |
# Defaults: comma-separated input, 7 bins, two decimals when numbers
# are turned into strings.
BEGIN   { FS      = ","
          BINS    = 7
          CONVFMT = "%.2f" }

# Remove blanks, tabs and carriage returns from every line.
        { gsub(/[ \t\r]/, "") }

# The header is analysed and echoed; data rows are stored until the end.
NR == 1 { head(); print }
NR > 1  { body() }
END     { tail() }
# Mark the columns to discretize: names that start with an upper-case
# letter and do not end in "-", "+" or "!". Membership in hi[] is the
# marker; every bin's minimum starts at 1E32.
function head(    col, b) {
  for (col = 1; col <= NF; col++) {
    if ($col !~ /^[A-Z]/) continue
    if ($col ~ /[-+!]$/)  continue
    lo[col] = 1E32
    hi[col] = -1E32
    for (b = 0; b <= BINS; b++) bmin[col][b] = 1E32 }}
# Store the current record in row[NR-1], passing each cell through
# seen() so the column statistics are updated on the way.
function body(    col) {
  col = 1
  while (col <= NF) {
    row[NR-1][col] = seen(col, $col)
    col++ }}
# Two passes over the stored rows: the first lets every bin learn its
# smallest member, the second prints each row with values replaced by
# their bin's minimum.
function tail(    r, c, line) {
  for (r in row)
    for (c = 1; c <= NF; c++)
      bin(c, row[r][c])
  for (r in row) {
    line = ""
    for (c = 1; c <= NF; c++)
      line = line (c > 1 ? "," : "") bin(c, row[r][c])
    print line }}
# Fold value v into the running count, mean, M2 and standard deviation
# of column i (incremental update). "?" values and columns not marked
# in hi[] are returned untouched.
function seen(i, v,    delta) {
  if (v == "?" || !(i in hi)) return v
  v += 0
  cnt[i]++
  delta  = v - mu[i]
  mu[i] += delta / cnt[i]
  m2[i] += delta * (v - mu[i])
  sd[i]  = (cnt[i] >= 2) ? sqrt(m2[i] / (cnt[i] - 1)) : 0
  return v }
# Map v to a bin of column i via a logistic curve centred on the column
# mean, remember the smallest value seen in that bin, and return that
# minimum. "?" values and unmarked columns are returned untouched.
function bin(i, v,    b) {
  if (v == "?" || !(i in hi)) return v
  b = int(BINS / (1 + exp(-1.704 * (v - mu[i]) / (sd[i] + 1E-32))))
  if (v < bmin[i][b]) bmin[i][b] = v
  return bmin[i][b] }
| #!/usr/bin/env lua | |
| local help = [[ | |
| bins.lua : stochastic incremental XAI | |
| (c) 2025, Tim Menzies, [email protected], mit-license.org | |
| Options: | |
| -h Show help. | |
| -b bins=7 Number of bins for discretization. | |
| -e era=30 Update model every `era` number of rows. | |
| -r ruleMax=3 Max conditions in a rule. | |
| -s seed=42 Random number seed. | |
| -f file=../lua6/auto93.csv ]] | |
--- Turn a string into a number when it parses as one, otherwise return
-- it with leading and trailing white space removed.
-- Returns nothing when `s` is nil or false.
local function coerce(s)
  if not s then return end
  local n = tonumber(s)
  if n then return n end
  return s:match("^%s*(.-)%s*$")
end
-- Settings: every `key=value` pair found in the help text becomes a
-- default in `the`; the random seed is then taken from those settings.
local the = {}
for key, default in help:gmatch("(%S+)=(%S+)") do
  the[key] = coerce(default)
end
math.randomseed(the.seed)
-- Forward declarations; each is assigned further down by `function NAME(...)`.
local DATA, NUM, SYM, COLS, clone, adds

-- Short names for library functions.
local abs, exp, sqrt, log = math.abs, math.exp, math.sqrt, math.log
local max, rand, cos = math.max, math.random, math.cos
local fmt = string.format

--- Sort `t` in place (optionally with comparator `f`) and return it.
local function sort(t, f)
  table.sort(t, f)
  return t
end

--- Build a "less than" comparator from a scoring function `f`.
local function lt(f)
  return function(a, b) return f(a) < f(b) end
end

--- Join a list of strings with spaces and wrap the result in braces.
local function cat(a)
  return "{" .. table.concat(a, " ") .. "}"
end
--- Render any value as a string.
-- Whole numbers print without decimals, other numbers with three.
-- Tables with a non-empty array part print their items in order;
-- other tables print as sorted ":key value" pairs.
local function o(v)
  local kind = type(v)
  if kind == "number" then
    return fmt(v % 1 == 0 and "%.0f" or "%.3f", v)
  elseif kind ~= "table" then
    return tostring(v)
  end
  local parts = {}
  if #v > 0 then
    for _, item in ipairs(v) do parts[#parts + 1] = o(item) end
    return cat(parts)
  end
  for key, item in pairs(v) do
    parts[#parts + 1] = fmt(":%s %s", key, o(item))
  end
  return cat(sort(parts))
end
--- Split a comma-separated line into a list of coerced cells.
-- Runs of commas produce no cell (empty fields are skipped).
local function s2a(s)
  local cells = {}
  for cell in s:gmatch("([^,]+)") do
    cells[#cells + 1] = coerce(cell)
  end
  return cells
end
--- Iterate over the rows of a csv file.
-- Raises an error if the file cannot be opened. Each call of the
-- returned function yields the next line as a list of cells; when the
-- file is exhausted the handle is closed and nothing is returned.
local function csv(file)
  local stream = assert(io.open(file))
  return function()
    local line = stream:read()
    if line then return s2a(line) end
    stream:close()
  end
end
--- Split list `a0` into its first `n` items and the remainder.
-- When `data` is given, both halves are returned as clones of `data`
-- filled with those items; otherwise two plain lists are returned.
local function cut(a0, n, data)
  local head, tail = {}, {}
  for j, v in ipairs(a0) do
    local dest = j <= n and head or tail
    dest[#dest + 1] = v
  end
  if data then return clone(data, head), clone(data, tail) end
  return head, tail
end
--- Draw one sample from a normal distribution (Box-Muller transform).
-- @param mu  mean of the distribution
-- @param sd  standard deviation of the distribution
-- @return    a finite random number
-- Left global, as in the original definition, so existing callers keep
-- working.
function box_muller(mu, sd)
  -- math.random() may return exactly 0; 1 - r lies in (0,1], so the
  -- logarithm is always finite.
  local u1 = 1 - math.random()
  local u2 = math.random()
  return mu + sd * math.sqrt(-2 * math.log(u1)) * math.cos(2 * math.pi * u2)
end
| -- ---------------------------------------------------------------------------- | |
--- Create a DATA (row store plus column summaries) and fill it from
-- `src`, which is handed to `adds` unchanged.
function DATA(src)
  local data = { n = 0, rows = {}, cols = nil }
  return adds(src, data)
end
--- Make a new DATA with the same column names as `i`, then add the
-- rows in `src` to it.
function clone(i, src)
  local empty = DATA({ i.cols.names })
  return adds(src, empty)
end
--- Create a summary of a numeric column.
-- @param at  column position (defaults to 0)
-- @param s   column name; a trailing "+" sets `best` to 1, else 0
function NUM(at, s)
  local maximize = tostring(s):find("%+$") ~= nil
  return {
    at = at or 0, of = s, n = 0,
    mu = 0, m2 = 0, sd = 0,
    bins = {},
    best = maximize and 1 or 0,
  }
end
--- Create a summary of a symbolic column.
-- @param at  column position
-- @param s   column name
function SYM(at, s)
  local sym = { at = at, of = s, n = 0 }
  sym.has  = {}   -- counts per symbol
  sym.bins = {}
  return sym
end
--- Build column summaries from a header row of names.
-- Names starting with an upper-case letter become NUMs, all others
-- SYMs. Names ending in "X" are kept in `all` only; names ending in
-- "+" or "-" also go to `y`, every other name also goes to `x`.
function COLS(row)
  local x, y, all = {}, {}, {}
  for n, s in ipairs(row) do
    local make = s:match("^[A-Z]") and NUM or SYM
    local col = make(n, s)
    all[n] = col
    if not s:match("X$") then
      local role = s:find("[+-]$") and y or x
      role[#role + 1] = col
    end
  end
  return { all = all, x = x, y = y, names = row }
end
--- Add one value to a summary and return the value.
-- `i` is told apart by its fields: `has` (SYM), `mu` (NUM) or `rows`
-- (DATA). For a DATA, `v` is a row: the first row received is the
-- header and only builds the columns; later rows update every column
-- and are stored. "?" (missing) is ignored.
-- The header row is not counted in `i.n`, so a DATA's `n` equals its
-- number of stored rows (it used to be one too large).
local function add(i, v)
  if v == "?" then return v end
  if i.rows and not i.cols then
    i.cols = COLS(v)
    return v
  end
  i.n = i.n + 1
  if i.has then
    i.has[v] = 1 + (i.has[v] or 0)
  elseif i.mu then
    -- incremental update of mean, M2 and standard deviation
    local d = v - i.mu
    i.mu = i.mu + d / i.n
    i.m2 = i.m2 + d * (v - i.mu)
    i.sd = i.n < 2 and 0 or sqrt(i.m2 / (i.n - 1))
  elseif i.rows then
    for _, col in pairs(i.cols.all) do add(col, v[col.at]) end
    i.rows[1 + #i.rows] = v
  end
  return v
end
--- Add many items to `it` (a fresh NUM when `it` is omitted).
-- A string `src` is read as a csv file name; otherwise `src` is
-- treated as a table of items (nil means no items). Returns `it`.
function adds(src, it)
  it = it or NUM()
  if type(src) == "string" then
    for row in csv(src) do add(it, row) end
    return it
  end
  for _, row in pairs(src or {}) do add(it, row) end
  return it
end
--- Map a number to 0..1 with a logistic curve centred on the column
-- mean. Values of symbolic columns and "?" are returned unchanged.
local function norm(i, v)
  if (i.has or v == "?") and v then return v end
  return 1 / (1 + math.exp(-1.7 * (v - i.mu) / (i.sd + 1e-32)))
end
--- Distance of a row's goal values from their best values:
-- root mean square of (normalized goal - best) over the goal columns.
-- Missing goals ("?") are skipped; previously they reached the
-- subtraction as a string and raised an arithmetic error. A row with
-- no usable goal value scores 0 instead of NaN.
local function disty(i, row)
  local d, n = 0, 0
  for _, y in pairs(i.cols.y) do
    local v = row[y.at]
    if v ~= "?" then
      d = d + (norm(y, v) - y.best) ^ 2
      n = n + 1
    end
  end
  -- the epsilon only matters when n is 0
  return sqrt(d / (n + 1e-32))
end
--- Sort rows in place by `disty`, smallest first, and return them.
-- Sorts `i.rows` when `rows` is not given.
local function distys(i, rows)
  local function score(row) return disty(i, row) end
  return sort(rows or i.rows, lt(score))
end
--- Incrementally split seen rows into a small `best` set and a `rest`
-- set, and return them with a model that scores new rows.
-- Steps: shuffle rows; keep half for testing; seed `seen` with 4
-- training rows; the 2 rows with smallest `disty` start `best`, the
-- others `rest`; then walk at most 256 further training rows, adding a
-- row when it is closer to `best` than to `rest`, and demoting the last
-- row of sorted `best` whenever `best` outgrows sqrt(#seen).
-- The walk uses `ipairs` so rows are visited in list order and the
-- 256-row budget is honoured (the order of `pairs` is unspecified).
-- NOTE(review): shuffle, distx, mid and sub are not defined in the
-- part of this file that is visible here; confirm they are in scope
-- before calling two().
-- @return table with fields best, rest, seen, test and model; a
--         negative model(row) means the row is closer to `best`.
local function two(data)
  local train, test, start, todo, seen, best, rest, d
  shuffle(data.rows)
  train, test = cut(data.rows, data.n // 2)
  start, todo = cut(train, 4)
  seen = clone(data, start)
  best, rest = cut(distys(seen), 2, data)
  d = function(row, what) return distx(seen, row, mid(what)) end
  for n, row in ipairs(todo) do
    if n > 256 then break end
    if d(row, best) < d(row, rest) then
      add(seen, add(best, row))
      if best.n > sqrt(seen.n) then
        add(rest, sub(best, table.remove(distys(best))))
      end
    end
  end
  distys(best)
  return { best = best, rest = rest, seen = seen, test = test,
           model = function(row) return d(row, best) - d(row, rest) end }
end
| -- ---------------------------------------------------------------------------- | |
-- Command-line actions, keyed by flag. Each receives the (coerced)
-- argument that follows the flag on the command line.
local egs = {
  ["-h"]    = function(_) print("\n" .. help .. "\n") end,
  ["-s"]    = function(n) math.randomseed(n); the.seed = n end,
  ["--the"] = function(_) print(o(the)) end,
  ["--csv"] = function(_)
    for row in csv(the.file) do print(o(row)) end
  end,
}

-- Summarise 1000 samples drawn around mean 10, sd 5.
egs["--num"] = function(_)
  local num = NUM()
  for _ = 1, 1000 do add(num, box_muller(10, 5)) end
  print(fmt("%.3f %.3f", num.mu, num.sd))
end

-- Show the x column summaries of the configured file.
egs["--data"] = function(_)
  local data = DATA(the.file)
  for n, col in pairs(data.cols.x) do print(n, o(col)) end
end
-- Sort the rows of the configured file by `disty` and print each
-- row's distance. `data` and `t` are locals; `t` used to be assigned
-- without a declaration and so leaked into the global table.
egs["--disty"] = function(_)
  local data, t = DATA(the.file), {}
  distys(data)
  for n, row in pairs(data.rows) do t[n] = disty(data, row) end
  print(o(t))
end
--- Process command-line arguments from left to right.
-- An argument naming an entry of `funs` runs it with the following
-- argument (coerced). Otherwise an argument of the form "-x" sets every
-- setting in `d` whose name starts with "x" to the following argument.
-- Changes from the previous version:
--   * `ipairs` gives left-to-right order (the order of `pairs` is
--     unspecified) and skips arg[0] and negative indexes;
--   * only arguments that start with "-" are treated as setting flags,
--     so a value such as "ab" is no longer re-read as flag "b";
--   * a flag with no following value leaves the setting unchanged
--     instead of erasing it.
local function cli(d, funs)
  for i, s in ipairs(arg) do
    local nxt = arg[i + 1]
    if funs[s] then
      funs[s](coerce(nxt))
    elseif nxt ~= nil and s:sub(1, 1) == "-" then
      for k, _ in pairs(d) do
        if k:sub(1, 1) == s:sub(2) then d[k] = coerce(nxt) end
      end
    end
  end
end

-- Run the command line only when this file is the script being run.
-- Plain-text find: "." in the file name is not a pattern wildcard.
if arg and arg[0] and arg[0]:find("bins.lua", 1, true) then cli(the, egs) end
Copyright (c) 2025 Tim Menzies, MIT License
https://opensource.org/licenses/MIT
chmod +x bins && export PATH=$PATH:$(pwd) # install
bins < data.csv > discretized.csv # useCSV file where:
-, +, or !)? for missing values)bins < auto.csv
# Numeric columns divided into 7 bins (0-6)
# Each value replaced by its bin's minimum value
-+! suffix)
bin = BINS / (1 + exp(-1.704 * (v - mean) / stddev))
Note: Uses Welford's algorithm for numerically stable mean/variance calculation. The constant 1.704 ensures the middle values fall into the middle bins.
| #!/usr/bin/env bash | |
| # Copyright (c) 2025 Tim Menzies, MIT License | |
| # https://opensource.org/licenses/MIT | |
# Directory that holds this file. It must be set before the alias and
# PATH lines below, because both expand $Here when they are defined
# (it used to be assigned after them, giving "source '/ell'" and a PATH
# that started with an empty entry).
Here="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

alias ls="\ls --color"
alias reload="source '$Here/ell' && echo ✅"
alias grep='grep --color=auto'
alias tree='tree -C'

export BASH_SILENCE_DEPRECATION_WARNING=1
# Prepend $Here once only, so `reload` does not keep growing PATH.
case ":$PATH:" in
  *":$Here:"*) ;;
  *) export PATH="$Here:$PATH" ;;
esac
export HISTSIZE=10000
export HISTFILESIZE=20000
export HISTCONTROL=ignoredups:erasedups

# Prompt helpers: current git branch, and "*" when the tree has changes.
branch() { git branch 2>/dev/null | awk '/^\*/ {print $2}'; }
dirty() { [[ -n $(git status -s 2>/dev/null) ]] && echo "*"; }

# Prompt: parent/current directory, then branch and dirty marker.
bold=$(tput bold) col0=$(tput sgr0) col1=$(tput setaf 6) col2=$(tput setaf 3)
PROMPT_COMMAND='PS1="${bold}${col1}$(basename "$(dirname "$PWD")")/$(basename "$PWD")${col0} ${col2}$(branch)$(dirty)${col0} ▶ "'
# Start nvim without user configuration (--clean) and apply this
# project's editor settings, one --cmd per setting.
vi() {
  local -a setup=(
    "let g:netrw_banner=0 | let g:netrw_liststyle=3 | let g:netrw_browse_split=4 | let g:netrw_winsize=15"
    "set number relativenumber cursorline mouse=a clipboard=unnamedplus ignorecase smartcase"
    "set statusline=%#StatusLine#\ ▶\ %f\ %m%r%=%y\ ❖\ %l:%c\ ❖\ %p%%\ "
    "set expandtab tabstop=2 shiftwidth=2 splitright splitbelow"
    "set undofile undodir=~/.vim/undo"
    "nnoremap Q :quitall<CR>"
    "colorscheme zaibatsu"
    "set laststatus=2"
  )
  local -a args=(--clean)
  local c
  for c in "${setup[@]}"; do
    args+=(--cmd "$c")
  done
  nvim "${args[@]}" "$@"
}
# Clear the screen and print the banner in colour col1, then switch
# back to col0.
hi() {
  clear
  printf '%s\n' "${col1}"
  cat <<'EOF'
██╗ ██╗ ████████╗ ██████╗
██║ ██║ ╚══██╔══╝ ██╔══██╗
██║ ██║ ██║ ██████╔╝
██║ ██║ ██║ ██╔══██╗
███████╗ ██║ ██║ ██║ ██║
╚══════╝ ╚═╝ ╚═╝ ╚═╝ ╚═╝
(cause simple ain't stupid)
EOF
  printf '%s\n' "${col0}"
}
# Install whichever of the named commands are not yet on PATH, using
# the platform's package manager (brew, apt or winget).
# Usage: inst cmd1 cmd2 ...      (also accepts: inst "cmd1 cmd2")
# Every argument is checked; previously only $1 was, so a call such as
# `inst git nvim gawk tree` looked at git alone.
# Returns 0 when nothing is missing.
inst() {
  local p m=""
  # $* is deliberately unquoted so both calling forms split into words.
  for p in $*; do
    command -v "$p" &>/dev/null || m+="$p "
  done
  [ -z "$m" ] && return 0
  case "$(uname -s)" in
    Darwin*) brew install $m ;;
    Linux*)  sudo apt install -y $m ;;
    MINGW*)  winget install $m ;;
  esac
}
# Slow or noisy steps run only when this file is executed directly.
# When it is sourced (for example by `reload`) the names differ and the
# block is skipped. The final exec replaces this process with an
# interactive bash that uses this file as its init file.
if [[ "${0}" == "${BASH_SOURCE[0]}" ]]; then
  inst git nvim gawk tree
  hi
  exec bash --init-file "${BASH_SOURCE[0]}" -i
fi
| #!/usr/bin/env gawk -f | |
| # Copyright (c) 2025 Tim Menzies, MIT License | |
| # https://opensource.org/licenses/MIT | |
# Print the header, then the data rows in batches ("eras") of about ERA
# rows, each batch preceded by a blank line and emitted in the array's
# traversal order.
# Rows are keyed by a random number plus the record number. A bare
# rand() subscript is converted to a string with CONVFMT (6 significant
# digits by default), so two rows could get the same key and one would
# be silently overwritten; appending NR makes every key unique.
BEGIN { SEED = 1234567891
        ERA  = 100 }
NR==1 { srand(SEED)
        print "\n" $0; next }
      { a[rand() SUBSEP NR] = $0
        if (length(a) > ERA) dump(a) }
END   { dump(a) }
# Print a blank line and then every element of array a, and empty the
# array. Does nothing when a has no elements.
function dump(a,    key) {
  if (length(a) == 0) return
  print ""
  for (key in a) print a[key]
  delete a }
#!/usr/bin/env sh
# Render the arguments with figlet (font "mini") and prefix every
# output line with "-- ".
# "$@" is quoted so arguments are passed through unchanged; unquoted,
# the shell would expand characters such as * in them.
figlet -W -f mini "$@" | gawk '{print "-- " $0}'
| # Lua.ssh --- Sheet definitions for Lua source code | |
| # Copyright (c) 2014 Kenji Rikitake | |
| # Copyright (c) 1999 Edward Arthur, Akim Demaille, Miguel Santana | |
| # | |
| # | |
| # This file is NOT a part of a2ps. | |
| # | |
| # This program is free software; you can redistribute it and/or modify | |
| # it under the terms of the GNU General Public License as published by | |
| # the Free Software Foundation; either version 2, or (at your option) | |
| # any later version. | |
| # | |
| # This program is distributed in the hope that it will be useful, | |
| # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| # GNU General Public License for more details. | |
| # | |
| # You should have received a copy of the GNU General Public License | |
| # along with this program; see the file COPYING. If not, write to | |
| # the Free Software Foundation, 59 Temple Place - Suite 330, | |
| # Boston, MA 02111-1307, USA. | |
| ## This style is derived from Edward Arthur's AWK Style Sheet | |
| style Lua is | |
| written by "Kenji Rikitake <[email protected]>" | |
| version is 0.1 | |
| requires a2ps version 4.9.7 | |
| documentation is | |
| "This style file is intended to support the Lua programming language source code." | |
| end documentation | |
| alphabets are | |
| "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz_0" | |
| case sensitive | |
| keywords in Keyword are | |
| and, false, ipairs, nil, not, or, pairs, rawequal, rawget, | |
| rawlen, rawset, select, tonumber, tostring, true, type | |
| end keywords | |
| #keywords in Keyword_strong are | |
| keywords in Keyword_strong are | |
| assert, break, collectgarbage, do, dofile, else, elseif, "end", | |
| error, for, function, getmetatable, goto, if, "in", local, | |
| load, loadfile, next, pcall, print, repeat, require, return, | |
| setmetatable, then, until, while, xpcall, _G, _VERSION | |
| end keywords | |
| keywords in Label_strong are | |
| "^" function | |
| end keywords | |
| keywords in Comment are | |
| self | |
| end keywords | |
| sequences are | |
| "[[" Comment "]]", | |
| "--" Comment, | |
| C-string | |
| end sequences | |
| end style | |
| # vim: ts=2 sw=2 noet | |
| # Copyright (c) 2025 Tim Menzies, MIT License | |
| # https://opensource.org/licenses/MIT | |
| #------------------------------------------------------ | |
# Tunables (override on the command line, e.g. `make eg-nbc BINS=5`).
# The trailing "#" keeps stray trailing blanks out of the values.
K     ?= 1#
M     ?= 2#
BINS  ?= 4#
ERA   ?= 100#
DATA   = ~/gits/moot/classify/diabetes.csv#
DATAG  = ~/gits/moot/optimize/misc/auto93.csv#
G      = gawk -f#
OK     = gawk -f ok.awk -f#
#------------------------------------------------------
SHELL=/bin/bash
export PATH := $(CURDIR):$(PATH)
.SILENT:
# Every target that names an action rather than a file is listed here,
# so a file called e.g. "test" or "sh" cannot stop the target running.
.PHONY: help test egs ok eg-bins eg-nbc eg-abcd eg-soybean eg-best eg-tree \
        eg-globals sh pull push
# List every target that carries a "## description" comment.
help: ## show this help
	@gawk 'BEGIN { FS=":.*?## "; esc="\033[1;3"; off="\033[0m"; \
	         printf "\n%s6mmake%s [%s3moptions%s]:\n\n",esc,off,esc,off } \
	       NF==2 && $$1~/^[a-z0-9A-Z_-]+/ { \
	         printf " %s2m%-15s%s %s\n",esc,$$1,off,$$2 }' $(MAKEFILE_LIST)
| #------------------------------------------------------ | |
# Banner naming the target that is running.
hi=@echo -e "\n--------- $@"

test: eg-nbc eg-abcd eg-soybean eg-globals ## run all egs: now

ok:; chmod +x nbc abcd bins

eg-bins: ok $(DATA) ## discretize the numeric columns of DATA
	cat $(DATA) | bins BINS=$(BINS)

eg-nbc: ok $(DATA) ## run naive bayes classifier
	$(hi); cat $(DATA) | bins BINS=$(BINS) | nbc K=$K M=$M | sort | uniq -c

# abcd is now part of the pipeline; it had been commented out, so this
# target printed raw predictions instead of the confusion matrix.
eg-abcd: ok $(DATA) ## run classifier with confusion matrix
	$(hi); cat $(DATA) | bins BINS=$(BINS) \
	  | nbc K=$K M=$M | abcd

eg-soybean: ## run classifier on soybean
	$(hi); $(MAKE) eg-abcd DATA=~/gits/timm/moot/classify/soybean.csv

# These two read DATAG, so that is the file they depend on.
eg-best: ok $(DATAG) ## bin DATAG, pipe through best, show first and last rows
	cat $(DATAG) | bins BINS=$(BINS) | best | column -s, -t | sed -n '1,5p;395,$$p'

eg-tree: ok $(DATAG) ## bin DATAG, pipe through best, then tree
	cat $(DATAG) | bins BINS=$(BINS) | best | tree

eg-globals: ok $(DATA) ## run the pipeline under ok.awk to report global variables
	$(hi); ( cat $(DATA) | $(OK) bins BINS=$(BINS) \
	  | $(OK) nbc K=$K M=$M | $(OK) abcd ) > /dev/null
# misc ------------------------------------------------
# ell uses bash-only features ([[ ]], BASH_SOURCE, arrays), so it is
# started with bash rather than sh.
sh: ## run a customized shell
	bash ell

pull: ## update from main
	git pull

push: ## commit to main
	git commit -am saving; git push; git status
# Pretty-print a Lua file with a2ps (landscape, 3 columns, lua.ssh
# style sheet), convert the result to PDF and open it.
~/tmp/%.pdf: %.lua
	echo "pdf-ing $@ ... "
	a2ps \
	  --file-align=virtual \
	  --line-numbers=1 \
	  --pro=color \
	  --lines-per-page=120 \
	  --pretty=lua.ssh \
	  --left-title="" \
	  --borders=no \
	  --right-footer="page %s. of %s#" \
	  --landscape \
	  --columns 3 \
	  -M letter \
	  -o - $< | ps2pdf - $@
	open $@
| #!/usr/bin/env gawk -f | |
| # Copyright (c) 2025 Tim Menzies, MIT License | |
| # https://opensource.org/licenses/MIT | |
# Defaults: comma-separated input; predict only after WAIT training
# rows; M and K are the smoothing constants used by predict().
BEGIN { FS   = ","
        WAIT = 20
        M    = 2
        K    = 1 }

# Remove blanks, tabs and carriage returns from every line.
{ gsub(/[ \t\r]/, "") }

# Header: find the class column.
NR == 1 { head() }

# Data rows: first predict (once past the warm-up), then learn the row.
NR > 1 && NR > WAIT+1 { print $klass, predict() }
NR > 1                { train($klass) }
# Find the class column: the one whose name ends in "!". If several
# match, the last one wins.
function head(    col) {
  for (col = 1; col <= NF; col++) {
    if ($col !~ /[!]$/) continue
    klass = col }}
# Learn the current record: count its class, and count each non-class
# cell value under that class.
function train(actual,    col) {
  nk[actual]++
  for (col = 1; col <= NF; col++) {
    if (col == klass) continue
    freq[col][$col][actual]++ }}
# Return the class with the largest log-likelihood for the current
# record: log prior (smoothed with K) plus, for each non-class column,
# the log of the smoothed (M) frequency of that cell value in the class.
# The prior's denominator counts the rows trained so far, NR-2 (the
# header and the current row are excluded); it used to be NR-1. The
# term is the same for every class, so the chosen class is unaffected.
function predict(    i, k, like, best, max, trained) {
  max = -1E32
  trained = NR - 2
  for (k in nk) {
    like = log((nk[k] + K) / (trained + K*length(nk)))
    for (i = 1; i <= NF; i++)
      if (i != klass)
        like += log((freq[i][$i][k] + M/length(freq[i])) / (nk[k] + M))
    if (like > max) {
      max  = like
      best = k }}
  return best ? best : k }
Copyright (c) 2025 Tim Menzies, MIT License
https://opensource.org/licenses/MIT
chmod +x nbc && export PATH=$PATH:$(pwd) # install
bins < data.csv | nbc # useCSV file where:
! is the class label)? for missing values)Uses diabetes.csv (binned)
make eg-nbc
--------- eg-nbc
383 tested_negative tested_negative
110 tested_negative tested_positive
83 tested_positive tested_negative
172 tested_positive tested_positiveNote: Requires discretized data (pipe through bins first). Output can be piped to abcd for metrics.
actual_class predicted_classlog P(class|features) = log P(class) + Σ log P(feature|class)| # Copyright (c) 2025 Tim Menzies, MIT License | |
| # https://opensource.org/licenses/MIT | |
# At end of input, list on stderr every name in SYMTAB that is not one
# of gawk's built-in variables.
END { rogues() }

function rogues(    name, found, builtin) {
  builtin = "^(NF|NR|FS|RS|RT|FNR|OFS|ORS|PREC|ARGC|ARGV|OFMT|LINT|FPAT|" \
            "ERRNO|RSTART|ARGIND|SUBSEP|CONVFMT|ENVIRON|SYMTAB|FUNCTAB|" \
            "PROCINFO|FILENAME|RLENGTH|BINMODE|IGNORECASE|FIELDWIDTHS|" \
            "ROUNDMODE|TEXTDOMAIN)$"
  for (name in SYMTAB)
    if (name !~ builtin) found = found " " name
  if (found) print "?" found > "/dev/stderr" }
XXX data model layer: keep the presentation layer separate from the business model
XXX too many vars==> too many ideas XXX vars passed as a group to a subfunction ==> hidden objects
XXX yo yo is hatton's point (does oo syntax with the say we thing)
XXX test-driven development: fig3 of https://ieeexplore.ieee.org/document/10352439 offers details on test
A comprehensive catalog of programming heuristics with examples from two.lua, Python, and larger systems.
These heuristics represent some basic ideas about what is "good" SE.
But be careful how you use them. If you only use this knowledge to complain about 'bad code,' you become the bottleneck to other people's work. But if you use it to quietly refactor and fix the problems you see, you can earn a reputation as a tech guru.
In two.lua:
local help = [[
-b bins=7 Number of bins for discretization.
-e era=30 Update model every `era` number of rows.
-r ruleMax=3 Max conditions in a rule.
-s seed=42 Random number seed.]]
local the={}; for k,v in help:gmatch("(%S+)=(%S+)") do the[k] = coerce(v) end
Single source of truth - help string defines both documentation AND defaults.
Python example:
# Bad: Define settings twice
DEFAULT_TIMEOUT = 30
parser.add_argument('--timeout', default=30, help='Timeout in seconds (default: 30)')
# Good: Define once
DEFAULTS = {'timeout': 30, 'retries': 3}
for key, val in DEFAULTS.items():
parser.add_argument(f'--{key}', default=val, help=f'{key} (default: {val})')Bigger system example: Django's database models define the schema once, then auto-generate admin interfaces, forms, migrations, and API serializers from that single definition.
In two.lua:
col = (s:match"^[A-Z]" and NUM or SYM)(n,s)
t = s:find"[+-]$" and y or x
Column names encode the schema - uppercase=numeric, +/-=goals, X=skip.
Python example:
# Encode validation rules in data
RULES = {
'email': r'^[\w\.-]+@[\w\.-]+\.\w+$',
'phone': r'^\d{3}-\d{3}-\d{4}$',
'zip': r'^\d{5}$'
}
def validate(field, value):
return re.match(RULES[field], value)Bigger system example:
Unix file permissions (rwxr-xr--) encode all access rules in 9 bits. The chmod program is simple because the data structure (permission bits) carries the knowledge.
In two.lua:
local function add(i,v, inc)
if v == "?" then return v end
inc = inc or 1
i.n = i.n + inc
if i.mode then i.has[v] = inc + (i.has[v] or 0)
elseif i.mu then ...
elseif i.rows then ...One function handles NUM, SYM, and DATA with minimal branching.
Python example:
def median(numbers):
sorted_nums = sorted(numbers)
n = len(sorted_nums)
mid = n // 2
return sorted_nums[mid] if n % 2 else (sorted_nums[mid-1] + sorted_nums[mid]) / 2Simple, direct algorithm. No complex edge case handling.
Bigger system example: Git's object model - everything is either a blob, tree, commit, or tag. Four types, composed simply, create an entire version control system.
In two.lua: Entire incremental XAI system in ~150 lines. Functions like:
local function cut(a0,n, data)
local a1,a2 = {},{}
for j,v in ipairs(a0) do if j <= n then a1[1+#a1]=v else a2[1+#a2]=v end end
if data then return clone(data,a1),clone(data,a2) end
return a1,a2 endPython example:
# Flask web server in 5 lines
from flask import Flask
app = Flask(__name__)
@app.route('/')
def hello(): return "Hello World!"
app.run()Bigger system example: SQLite is ~150K lines for a full SQL database. Compare to Oracle's millions of lines. Parsimony wins for embedded use.
In two.lua:
local lt = function(f) return function(a,b) return f(a) < f(b) end end
local cat = function(a) return "{".. table.concat(a," ") .."}" end
Names tell you exactly what they do: lt makes comparators, cat concatenates.
Python example:
def is_palindrome(text):
cleaned = ''.join(c.lower() for c in text if c.isalnum())
return cleaned == cleaned[::-1]Name and implementation clearly express intent.
Bigger system example:
Go's error handling - if err != nil { return err } is verbose but crystal clear. No hidden control flow like exceptions.
In two.lua:
local shuffle = function(t, n)
for m=#t,2,-1 do n=math.random(m); t[m],t[n]=t[n],t[m] end; return t endSimple Fisher-Yates, not some optimized version. Programmer time > machine time.
Python example:
# Simple bubble sort for teaching
def bubble_sort(arr):
for i in range(len(arr)):
for j in range(len(arr)-1-i):
if arr[j] > arr[j+1]:
arr[j], arr[j+1] = arr[j+1], arr[j]Not efficient, but easy to understand and maintain.
Bigger system example:
Python's sorted() uses Timsort - it's complex internally, but the API is dead simple: sorted(items). Economy of interface, not implementation.
In two.lua:
local the={}; for k,v in help:gmatch("(%S+)=(%S+)") do the[k] = coerce(v) endGenerate config parser from help string automatically.
Python example:
# Generate test cases from data
test_cases = [(i, i**2) for i in range(10)]
for input_val, expected in test_cases:
assert square(input_val) == expectedBigger system example: Protocol Buffers - define data schema once, generate serializers/deserializers for 20+ languages automatically.
In two.lua:
local function add(i,v, inc) ...
local function sub(i,v) return add(i,v,-1) end
add adds, sub subtracts. Does what the name says.
Python example:
class Stack:
def push(self, item): self.items.append(item)
def pop(self): return self.items.pop()
def peek(self): return self.items[-1]Method names are intuitive verbs that do exactly what you expect.
Bigger system example:
jQuery's $('.class').hide() - method names are verbs that do exactly what they say. No surprises.
In two.lua:
-- File I/O
local function csv(file, src)
src = assert(io.open(file))
return function( s)
s = src:read()
if s then return s2a(s) else src:close() end end end
-- String parsing
local function s2a(s, a)
a={}; for s1 in s:gmatch"([^,]+)" do a[1+#a] = coerce(s1) end; return a end
-- Type conversion
local function coerce(s)
if s then return tonumber(s) or s:match'^%s*(.-)%s*$' end endEach function handles one concern: I/O, parsing, or type conversion.
Python example:
# Separate data access, business logic, presentation
class UserRepository:
def get_user(self, id): return db.query(User).get(id)
class UserService:
def activate_user(self, user_id):
user = self.repo.get_user(user_id)
user.active = True
return user
class UserView:
def render(self, user): return f"<div>{user.name}</div>"Bigger system example: MVC pattern - Models handle data, Views handle presentation, Controllers handle user input. Each concern separated.
Important sub-case.
Yes. Replacing raw print statements is a best practice in software engineering. Relying on raw prints creates "noise" that is difficult to turn off when you move from development to production.
You generally have two paths to solve this: the Custom Wrapper (your "say" idea) or Standard Logging.
This is exactly what you proposed. It is a lightweight solution perfect for scripts, CLI tools, and small prototypes.
How it works: You wrap the print function in a conditional check based on a global flag.
# Configuration
VERBOSE = True # Toggle this single flag to silence the app
def say(message):
if VERBOSE:
print(f"[INFO] {message}")
# Usage
say("Fetching data...") # Prints only if VERBOSE is True
print("Fatal Error: 404") # Always prints (standard print for actual results)Pros:
say function later.
If your project is an application, a server, or a library used by others, you should skip the "say" function and use a logging library (like Python's logging, Java's Log4j, or JS winston).
How it works: Instead of a binary on/off switch, you use Levels. This allows you to silence "chatter" while keeping "warnings" active.
import logging
# Configuration: Set to INFO to hide DEBUG messages
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger()
# Usage
logger.debug("Variable x = 5") # Silenced (Level is too low)
logger.info("Process started.") # Printed
logger.warning("Disk space low.") # Printed
logger.error("Database crashed!") # PrintedPros:
| Feature | Raw print() |
Custom say() Wrapper |
Standard logging |
|---|---|---|---|
| Control | None | High (Custom code) | High (Configuration) |
| Silencing | Manual deletion | Single Flag (All or Nothing) | Leveled (Debug vs Error) |
| Complexity | Low | Low | Medium |
| Best For | Throwaway code | Scripts & Tools | Production Apps |
print statements for any informational text.
say: If you just need a "mute button," your proposed solution is perfect.
logging later: If the project grows and you need to differentiate between "This is just info" and "This is a warning," switch to a logging library.
In two.lua:
local function shuffle(t, n) -- Only shuffles
local function cut(a0,n, data) -- Only splits
local function norm(i,v) -- Only normalizesPython example:
def read_file(path):
with open(path) as f:
return f.read()
def parse_json(text):
return json.loads(text)
def validate_config(config):
assert 'version' in config
return config
# Each does one thing
config = validate_config(parse_json(read_file('config.json')))Bigger system example:
Unix utilities - grep only searches, sort only sorts, uniq only deduplicates. Compose them with pipes.
In two.lua:
local function add(i,v, inc)
if v == "?" then return v end
inc = inc or 1
i.n = i.n + inc
if i.mode then i.has[v] = inc + (i.has[v] or 0)
elseif i.mu then ...
elseif i.rows then ...Open for extension (new types via new fields), closed for modification (don't change add).
Python example:
class Shape:
def area(self): raise NotImplementedError
class Circle(Shape):
def __init__(self, r): self.r = r
def area(self): return 3.14 * self.r ** 2
class Square(Shape):
def __init__(self, s): self.s = s
def area(self): return self.s ** 2
# Can add new shapes without modifying existing code
shapes = [Circle(5), Square(4)]
total_area = sum(s.area() for s in shapes)Bigger system example: Linux kernel modules - add new device drivers without modifying kernel core. Open for extension, closed for modification.
In two.lua:
function DATA( src) return adds(src, {n=0,rows={},cols=nil}) end
function COLS(row, t,x,y,all,col)
x,y,all = {},{},{}
for n,s in ipairs(row) do
col = (s:match"^[A-Z]" and NUM or SYM)(n,s)
all[n] = col ...DATA has COLS, COLS has array of NUM/SYM. Tables composed, not inherited.
Python example:
class Engine:
def start(self): return "Engine running"
class Wheels:
def roll(self): return "Rolling"
class Car:
def __init__(self):
self.engine = Engine()
self.wheels = Wheels()
def drive(self):
return f"{self.engine.start()}, {self.wheels.roll()}"Car composed of Engine and Wheels, not inheriting from them.
Bigger system example: React components - compose small components into larger ones. No deep inheritance hierarchies, just composition.
In two.lua:
local function mid(i)
if i.mu then return i.mu
elseif i.has then return mode(i.has)
elseif i.rows then
i._mid = i._mid or mode(i.has)
return i._mid end endSame interface mid(i) works for NUM, SYM, or DATA.
Python example:
class Temperature:
def __init__(self, celsius):
self._celsius = celsius
@property
def fahrenheit(self):
return self._celsius * 9/5 + 32
t = Temperature(100)
print(t.fahrenheit) # Accessed like attribute, computed like methodBigger system example:
Ruby - array.length and string.length use same interface, don't care if it's a method or property.
In two.lua:
-- Commands (modify state, but also return for chaining)
local function add(i,v, inc) ... end
-- Queries (just read, never modify)
local function mid(i) ...
local function distx(i,row1,row2) ... Python example:
class Stack:
# Command - modifies state
def push(self, item):
self.items.append(item)
# Query - just reads
def size(self):
return len(self.items)
# Command that also returns (useful for chaining)
def pop(self):
return self.items.pop()Bigger system example:
SQL - SELECT queries don't modify data, INSERT/UPDATE/DELETE commands do. Clear separation.
In two.lua:
src = assert(io.open(file))Die immediately if file missing, don't propagate errors everywhere.
Python example:
def divide(a, b):
assert b != 0, "Cannot divide by zero"
return a / b
def process_user(user):
assert user is not None, "User cannot be None"
assert user.email, "User must have email"
return send_email(user.email)Bigger system example:
Rust's unwrap() - panic immediately on None/Err rather than silently propagating null through the system.
In two.lua:
local function add(i,v, inc)
...
if i.mode then i.has[v] = inc + (i.has[v] or 0)
elseif i.mu then ..."If it has .mu, treat it as NUM; if it has .mode, treat it as SYM"
Python example:
def process(file_like):
data = file_like.read()
file_like.close()
return data
# Works with real files, StringIO, network sockets, etc.
from io import StringIO
process(open('data.txt'))
process(StringIO("hello"))Bigger system example:
Python's file-like objects - anything with .read(), .write(), .close() can be treated as a file. StringIO, network sockets, HTTP responses all work the same.
In two.lua:
local function aha(col,v1,v2)
if v1=="?" and v2=="?" then return 1 end
if col.mode then return v1==v2 and 0 or 1 end
v1 = v1 ~= "?" and v1 or (v2 > 0.5 and 0 or 1)
v2 = v2 ~= "?" and v2 or (v1 > 0.5 and 0 or 1)
return math.abs(v1 - v2) endLiberal in what you accept - handles missing values gracefully.
Python example:
def safe_int(value, default=0):
"""Accept strings, floats, None - be liberal"""
if value is None:
return default
try:
return int(value)
except (ValueError, TypeError):
return default
safe_int("42") # 42
safe_int("hello") # 0
safe_int(None) # 0
safe_int(3.14) # 3
Bigger system example: HTML parsers - browsers accept malformed HTML and do their best to render it. Conservative in output (valid HTML), liberal in input (broken HTML).
In two.lua:
return function(row) return d(row,best) - d(row, rest) endtwo() returns a function, not exposing internal best/rest structures.
Python example:
# Bad: reaches through multiple objects
customer.wallet.money.subtract(price)
# Good: ask, don't reach
customer.pay(price)
class Customer:
def pay(self, amount):
self.wallet.deduct(amount)Bigger system example:
jQuery - $('#id').find('.class').hide() - each method only knows about what it returns, not the whole DOM tree structure.
In two.lua:
function NUM(at,s)
return {at=at or 0, of=s, n=0, mu=0, m2=0, sd=0,
best=(tostring(s) or ""):find"+$" and 1 or 0} end
function SYM(at,s) return {at=at, of=s, n=0, has={}, mode=0, most=-1} endNUM has mu/m2/sd, SYM has has/mode - can't accidentally mix them.
Python example:
from enum import Enum
class Status(Enum):
PENDING = 1
APPROVED = 2
REJECTED = 3
# Can't accidentally set status to "maybe" or 42
order.status = Status.APPROVED # OK
order.status = "approved" # Type errorBigger system example:
Rust's type system - Option<T> means "might be None", forcing you to handle the case. Can't accidentally use null.
In two.lua:
local function mode(d, v,n)
v,n = nil,0
for v1,n1 in pairs(d) do if n1>n then v,n=v1,n1 end end
return v end O(n) scan, not a maintained heap. Simple, works, good enough.
Python example:
# Simple O(n²) solution for small lists
def find_duplicates(items):
dups = []
for i, item in enumerate(items):
if item in items[i+1:]:
dups.append(item)
return dupsFor small lists, clarity beats optimization.
Bigger system example:
C's malloc/free vs garbage collection. Simple, predictable, worse than GC in some ways but wins for systems programming.
In two.lua:
local function distx(i,row1,row2, d)
d=0; for _,x in pairs(i.cols.x) do d= d + aha(x, row1[x.at],row2[x.at])^2 end
return sqrt(d/#i.cols.x) end
local function norm(i,v)
return 1/(1 + math.exp(-1.7 * (v - i.mu)/(i.sd + 1e-32))) endPure functions - same inputs always give same outputs.
Python example:
# Referentially transparent
def add(a, b):
return a + b
# Not referentially transparent
counter = 0
def increment():
global counter
counter += 1
return counterBigger system example: Functional programming languages like Haskell - pure functions are the default, side effects are explicit and tracked by type system.
In two.lua:
local function csv(file, src)
src = assert(io.open(file))
return function( s)
s = src:read()
if s then return s2a(s) else src:close() end end endReturns a function that reads one line at a time, not all at once.
Python example:
def fibonacci():
a, b = 0, 1
while True:
yield a
a, b = b, a + b
# Only computes what you use
for i, fib in enumerate(fibonacci()):
if i > 10: break
print(fib)Bigger system example:
Python's range() - range(1000000) doesn't create a million numbers in memory; it creates a small, lazy range object that computes each value on demand as you iterate over it.
In two.lua:
function COLS(row, t,x,y,all,col)
x,y,all = {},{},{}
for n,s in ipairs(row) do
col = (s:match"^[A-Z]" and NUM or SYM)(n,s)
all[n] = colThe column name determines which constructor to call (NUM or SYM).
Python example:
def create_logger(log_type):
if log_type == "file":
return FileLogger()
elif log_type == "console":
return ConsoleLogger()
elif log_type == "network":
return NetworkLogger()
logger = create_logger("file")Bigger system example:
Django's database backends - connection.cursor() returns different cursor objects (PostgreSQL, MySQL, SQLite) based on configuration.
In two.lua:
add(seen, add(best, row))Chaining operations - add returns the added value so you can immediately add it somewhere else.
Python example:
class QueryBuilder:
def __init__(self):
self.query = ""
def select(self, fields):
self.query += f"SELECT {fields} "
return self
def from_table(self, table):
self.query += f"FROM {table} "
return self
def where(self, condition):
self.query += f"WHERE {condition}"
return self
# Fluent chaining
query = QueryBuilder().select("*").from_table("users").where("age > 18")Bigger system example:
jQuery: $('#element').fadeIn().addClass('active').slideDown() - each method returns the object so you can chain.
In two.lua:
local function distys(i, rows, y)
y = function(row) return disty(i, row) end
return sort(rows or i.rows, function(r1,r2) return y(r1) < y(r2) end) endPass in different comparison functions to get different sorting behaviors.
Python example:
class PaymentProcessor:
def __init__(self, strategy):
self.strategy = strategy
def process(self, amount):
return self.strategy.pay(amount)
class CreditCard:
def pay(self, amount): return f"Paid ${amount} with credit card"
class PayPal:
def pay(self, amount): return f"Paid ${amount} with PayPal"
processor = PaymentProcessor(CreditCard())
processor.process(100)Bigger system example:
JavaScript's Array.sort() - pass any comparison function: items.sort((a,b) => a.price - b.price).
In two.lua:
function NUM(at,s)
return {at=at or 0, of=s, n=0, mu=0, m2=0, sd=0,
best=(tostring(s) or ""):find"+$" and 1 or 0} endUses or to provide defaults when arguments are nil.
Python example:
def greet(name="World", greeting="Hello"):
return f"{greeting}, {name}!"
greet() # "Hello, World!"
greet("Tim") # "Hello, Tim!"
greet("Tim", "Hi") # "Hi, Tim!"
greet(greeting="Hey") # "Hey, World!"Bigger system example:
Python's requests.get() - requests.get('http://example.com') has sensible defaults for timeout, headers, SSL verification.
In two.lua:
local function add(i,v, inc)
if v == "?" then return v end
inc = inc or 1
..."?" is treated as a special null/missing value. Code handles it gracefully.
Python example:
class NullLogger:
def log(self, msg): pass
def error(self, msg): pass
class RealLogger:
def log(self, msg): print(f"LOG: {msg}")
def error(self, msg): print(f"ERROR: {msg}")
# No if-checks needed
logger = NullLogger() if quiet_mode else RealLogger()
logger.log("Starting process") # Works either wayBigger system example:
NumPy's NaN handling - operations on NaN propagate gracefully: np.mean([1, 2, np.nan, 4]) returns nan rather than crashing.
In two.lua:
-- Immutable/pure calculations
local function distx(i,row1,row2, d)
d=0; for _,x in pairs(i.cols.x) do d= d + aha(x, row1[x.at],row2[x.at])^2 end
return sqrt(d/#i.cols.x) end
-- Mutable updates
local function add(i,v, inc)
i.n = i.n + inc
...Python example:
from dataclasses import dataclass
@dataclass(frozen=True)
class Point:
x: int
y: int
def distance_to(self, other):
return ((self.x - other.x)**2 + (self.y - other.y)**2)**0.5
# Core data is immutable, but can build new points
p1 = Point(0, 0)
p2 = Point(3, 4)
# p1.x = 5 # Error! FrozenBigger system example: React's virtual DOM - rendering functions are pure (props in → virtual DOM out), but the framework handles mutations to the real DOM separately.
In two.lua:
local function mid(i)
...
elseif i.rows then
i._mid = i._mid or mode(i.has)
return i._mid end endCache the computed _mid value. Compute once, reuse many times.
Python example:
from functools import lru_cache
@lru_cache(maxsize=128)
def fibonacci(n):
if n < 2: return n
return fibonacci(n-1) + fibonacci(n-2)
# Exponential → linear with one line
fibonacci(100) # Fast!Bigger system example: Redis - entire databases dedicated to caching. Web apps cache database queries, API responses, rendered HTML.
In two.lua:
local function mode(d, v,n)
v,n = nil,0
for v1,n1 in pairs(d) do if n1>n then v,n=v1,n1 end end
return v end
local function mid(i)
if i.mu then return i.mu
elseif i.has then return mode(i.has)
...Python example:
def process_order(order):
validate_order(order)
charge_customer(order)
send_confirmation(order)
update_inventory(order)
def validate_order(order):
assert order.items, "Order must have items"
assert order.customer, "Order must have customer"Bigger system example:
Unix philosophy - cat log | grep ERROR | sort | uniq -c | sort -rn | head - small tools composed.
In two.lua:
local function add(i,v, inc)
if v == "?" then return v end
inc = inc or 1
i.n = i.n + inc
if i.mode then i.has[v] = inc + (i.has[v] or 0)
elseif i.mu then ...Python example:
def process_payment(amount, account):
if amount <= 0:
return "Invalid amount"
if not account:
return "No account"
if account.balance < amount:
return "Insufficient funds"
# Happy path has minimal nesting
account.balance -= amount
return "Success"Bigger system example: Express.js middleware - authentication checks return early:
function requireAuth(req, res, next) {
if (!req.user) return res.status(401).send('Unauthorized');
next();
}In two.lua:
local function distx(i,row1,row2, d) -- distance in X space
local function disty(i,row, d) -- distance in Y space
local function distys(i, rows, y) -- sort all by Y distanceNames tell you exactly what space you're operating in.
Python example:
# Bad
def calc(x, y):
return x * y * 0.7
# Good
def calculate_discounted_price(original_price, quantity, discount_rate=0.7):
return original_price * quantity * discount_rateBigger system example:
Ruby on Rails - User.find(id) finds one, User.where(name: 'Tim') finds many, user.save persists.
In two.lua:
return function(row) return d(row,best) - d(row, rest) endThe returned function "closes over" d, best, and rest from the enclosing scope.
Python example:
def make_multiplier(factor):
def multiply(x):
return x * factor # Captures 'factor' from outer scope
return multiply
times_three = make_multiplier(3)
times_five = make_multiplier(5)
print(times_three(10)) # 30
print(times_five(10)) # 50
Bigger system example:
JavaScript event handlers - button.onclick = () => this.handleClick() captures this from the surrounding context.
In two.lua:
best=(tostring(s) or ""):find"+$" and 1 or 0Convert to string, or use empty string if nil. Chain safely without crashes.
Python example:
# Using dict.get() for safe navigation
user_name = user.get('profile', {}).get('name', 'Anonymous')
# Or with walrus operator
if (profile := user.get('profile')) and (name := profile.get('name')):
print(name)Bigger system example:
JavaScript's optional chaining: user?.address?.street?.name - if any part is undefined, entire expression becomes undefined instead of throwing TypeError.
In two.lua:
local function cli(d,funs)
for i,s in pairs(arg) do
if funs[s]
then funs[s](coerce(arg[i+1])) ...The command line argument parser is a simple loop checking a dictionary. No complex flag parsing libraries used.
Python example:
# Simple config loading
import json
def load_config(path):
with open(path) as f:
return json.load(f)Don't use a heavy configuration management library when a simple JSON load suffices.
Bigger system example: Redis - uses a simple text-based protocol (RESP) that is human-readable and easy to parse, rather than a complex binary protocol.
In two.lua:
local function two(data)
-- ... logic for clustering ...
return function(row) return d(row,best) - d(row, rest) end endThe code calculates clusters but doesn't implement features to "save" the model to disk or "export" to JSON. It runs, outputs, and exits.
Python example:
class User:
def __init__(self, name):
self.name = name
# YAGNI: Don't add address, phone, ssn until actually neededBigger system example: Extreme Programming (XP) - emphasizes implementing only the user stories scheduled for the current iteration, never building infrastructure for future hypothetical requirements.
In two.lua:
local function distx(i,row1,row2, d)
d=0; for _,x in pairs(i.cols.x) do d= d + aha(x, row1[x.at],row2[x.at])^2 end
return sqrt(d/#i.cols.x) end
Calculates distance on-the-fly every time. It does not cache a distance matrix (which would consume O(n²) memory for n rows).
Python example:
# Write clear code first, optimize later
total = sum(item.price for item in cart)
# Don't switch to numpy arrays unless 'cart' has millions of items
Bigger system example: Donald Knuth's famous quote from "Structured Programming with go to Statements" (1974): "Premature optimization is the root of all evil." His advice is to get the program correct first, then optimize only the critical hotspots that measurement reveals.
In two.lua:
local fmt = string.format
local function o(v, ...) -- complex stringification logicThe o function is complex to write, but it ensures that the output (and the code using it) is readable and clean.
Python example:
# Verbose to write, easy to read
if user.is_active and user.has_permission and not user.is_blocked:
grant_access()
# Hard to read (Code golf)
if all([u.a, u.p, not u.b]): g()Bigger system example: Python itself - strictly enforces indentation. It makes writing code slightly stricter, but guarantees that all code looks visually similar, optimizing for the reader.
In two.lua:
local help = [[
two.lua : stochastic incremental XAI
...
Options:
-h Show help.
-b bins=7 Number of bins for discretization. ]]All configuration options are visible in one place at the top of the file. You don't have to hunt through 5 files to find the settings.
Python example:
# Facade pattern helps minimize load
# Instead of importing 10 classes, import one
from my_library import easy_api
easy_api.run()Bigger system example: Go (Golang) - The language specification is small enough to hold in your head. It lacks features like generics (historically) or operator overloading to keep the mental model of the code simple.
In two.lua:
local function distys(i, rows, y)
y = function(row) return disty(i, row) end
return sort(rows or i.rows, function(r1,r2) return y(r1) < y(r2) end) endUses clear block structures (functions, scoped variables) and higher-order functions instead of goto or spaghetti jumps.
Python example:
# Structured control flow
try:
process_data()
except Error:
handle_error()
finally:
cleanup()Bigger system example: Dijkstra's "Go To Statement Considered Harmful" - the foundation of modern languages (Java, C#, etc.) which enforce structured loops and blocks over arbitrary jumps.
In two.lua:
local the={}; for k,v in help:gmatch("(%S+)=(%S+)") do the[k] = coerce(v) end
...
if n > 256 then break end -- Wait, 256 is magic!
Critique: two.lua actually violates this heuristic with the hard-coded 256 in if n > 256.
Mitigation: it moves most other numbers into the help string (bins=7, era=30), making them named constants in the.
Python example:
# Bad
time.sleep(86400)
# Good
SECONDS_IN_DAY = 86400
time.sleep(SECONDS_IN_DAY)Bigger system example:
HTTP Status Codes - We use HTTP_200_OK or HTTP_404_NOT_FOUND in constants files rather than hardcoding 200 or 404 throughout the application logic.
In two.lua:
local function coerce(s)
if s then return tonumber(s) or s:match'^%s*(.-)%s*$' end endThe system is strict about needing a value (or a default), but loose in accepting a string and converting it to a number if it looks like one.
Python example:
def add_to_cart(item_id):
# Accepts int or string "123", converts to int 123
id = int(item_id)
...Bigger system example:
REST APIs - Often accept "true", "True", or boolean true in JSON payloads for boolean fields to be accommodating to different clients.
In two.lua:
local help = [[ ... -b bins=7 ... ]]
-- The name "bins" is defined in the string, parsed into 'the.bins'.
-- We don't manually type 'the.bins = 7' separately.The definitions in the help string drive the logic. The variable name exists in one place.
Python example:
from collections import namedtuple
# Define field names once
Point = namedtuple('Point', ['x', 'y'])
p = Point(10, 20)
# Use p.x, p.y - names are consistentBigger system example:
Terraform - You define a resource name (e.g., aws_instance.web) once, and reference that symbolic name elsewhere in the infrastructure definition.
In two.lua:
-- No "require 'torch'" or "require 'lfs'"
-- Uses only standard math, table, string, and io libraries.The script runs on standard Lua. It doesn't require complex package managers or bleeding-edge compilers.
Python example:
# Sticking to standard library when possible
import datetime # Built-in, boring, reliable
# vs
import arrow # Better API, but an extra dependencyBigger system example: PostgreSQL - Startups often choose Postgres (boring, reliable, standard) over niche new NoSQL databases because "boring" means "it won't lose my data at 3 AM."
In two.lua:
local function add(i,v, inc)
-- Logic for how to add is INSIDE the object 'i' (via the function)
-- We don't ask "is i a NUM?" then do math.
if i.mode then i.has[v] = ...
elseif i.mu then ...You tell the add function to process v, and the internal logic decides how to handle it based on the object's structure.
Python example:
# Bad (Asking)
if wallet.balance > amount:
wallet.balance -= amount
# Good (Telling)
wallet.debit(amount) # Internal check raises error if insufficientBigger system example: Microservices - You send a command "Process Order" to the Order Service. You don't query the Order Service database, check the status, and then write a new status yourself.
In two.lua:
math.randomseed(the.seed)The randomness is seeded once at the very top level. Functions like distx are pure calculations; they don't change global state or print to the console.
Python example:
def main():
# Side effects (IO, DB) only in main/controller
data = read_file()
result = pure_calculation(data)
write_file(result)
def pure_calculation(data):
return data * 2 # No print statements hereBigger system example: Redux (React) - Reducers are pure functions with zero side effects. All side effects (API calls) are localized in "Thunks" or "Sagas".
In two.lua:
local function two(data)
local train,test,start,todo,seen,best,rest,d
shuffle(data.rows)
train,test = cut(data.rows, data.n//2)
-- ...The two function initializes train, test, seen right before using them. It doesn't rely on a global init() having been called 5 minutes earlier.
Python example:
# Bad: Order matters implicitly
obj.init()
obj.load()
obj.run()
# Good: Constructor handles it
obj = Runner(data) # Ready to go
obj.run()Bigger system example: Dependency Injection Containers - They ensure that when you request a service, all its dependencies are already created and wired together, removing manual temporal setup steps.
In two.lua:
local function cut(a0,n, data)
-- 'data' is an optional object, not a boolean 'is_data'
if data then return clone(data,a1),clone(data,a2) endInstead of passing true to say "return data objects", it passes the data object itself. If it's nil, it returns lists.
Python example:
# Bad
def create_user(name, is_admin=False): ...
# Good (Enums or separate methods)
def create_user(name, role=Role.USER): ...
def create_admin(name): ...Bigger system example:
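Windows API (bad example) - CreateFile takes seven positional parameters, several of them flag words or optional NULLs, making calls cryptic (e.g. CreateFile(name, GENERIC_READ, 0, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL)). Modern APIs prefer passing a config object or specific types.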
In two.lua:
local function o(v, list,dict)--> s;; Make anything a string.
-- This entire function exists solely to make complex tables
-- human-readable for debugging/printing.The code includes a robust "toString" equivalent (o) specifically to make internal state visible.
Python example:
class User:
def __repr__(self):
return f"<User id={self.id} email={self.email}>"
# Now print(user) gives useful info, not <object at 0x123>Bigger system example: Chrome DevTools - The entire web ecosystem is built with inspection tools in mind (Elements panel, Console, Network tab) to allow developers to peek into the runtime.
In two.lua:
local function csv(file, src)
src = assert(io.open(file)) -- Stops flow if file fails
return function( s)
s = src:read()
if s then return s2a(s) else src:close() end end endUsing assert halts the program immediately if the file is invalid. The closure uses if s then ... else close to control the loop termination.
Python example:
try:
process_payment()
except InsufficientFunds:
redirect_to_wallet() # Error drives the UX flowBigger system example: Erlang/Elixir Supervisors - The "Let it Crash" philosophy. If a process errors, it crashes, and a supervisor catches that crash (control flow) to restart it.
In two.lua:
for n,s in ipairs(row) do
-- Handles ANY number of columns.
-- Not restricted to "2 columns" or "10 columns".
The code works for 0 columns (empty), 1 column, or N columns. It doesn't arbitrarily cap the schema size.
Python example:
# Allow list of any size
def process_items(items):
for item in items:
...
# Don't create variables item1, item2, item3Bigger system example: UNIX File Descriptors - You can open as many files as memory/OS limits allow. The system doesn't arbitrarily limit you to "3 open files."
In two.lua:
local function cli(d,funs)
for i,s in pairs(arg) do
if funs[s]
then funs[s](coerce(arg[i+1])) ...The CLI logic cli is generic. It takes a table of functions funs (egs). You can add new commands just by adding to egs, without changing the cli parser.
Python example:
import argparse
# Argparse handles the parsing logic separate from your business logic
parser = argparse.ArgumentParser()
parser.add_argument('--train', action='store_true')Bigger system example:
Git subcommands (git commit, git add) - Git is structured so that many subcommands are actually standalone binaries (git-commit, git-add) invoked by the main wrapper.
In two.lua:
local function add(i,v, inc)
if i.mode then ... -- It matches a SYM type
elseif i.mu then ... -- It matches a NUM typeLua lacks native pattern matching, so it simulates it by checking for the existence of unique keys (mode vs mu).
Python example:
# Python 3.10+ Pattern Matching
match shape:
case Circle(r):
return 3.14 * r * r
case Rectangle(w, h):
return w * hBigger system example:
Rust match - The primary way to handle control flow. It forces you to handle every variant of an Enum, ensuring safety.
In two.lua:
local DATA, NUM, SYM, COLS, clone, adds
local exp,sqrt,log = math.exp, math.sqrt, math.logEverything is declared local. This prevents polluting the global namespace and is faster in Lua.
Python example:
def my_func():
x = 10 # Local variable
global y # Explicitly asking for global (discouraged)Bigger system example:
JavaScript (let/const vs var) - Modern JS moved to block scoping (let) to avoid the issues caused by function-scoped or global variables (var).
You are right; while "Hungarian Notation" is the classic reference for encoding types in names, your header.md defines a much lighter, more ergonomic variant (e.g., n for number, s for string) that reduces visual noise while keeping the benefits.
Here are the missing heuristics (55–60) to complete your catalog, including the Layered Architecture, TDD, and the specific Type Hinting breakdown you requested.
In two.lua:
local function runs( out)
for k, fun in pairs(eg) do -- 'eg' is a table of test functions
math.randomseed(the.seed) -- Reset seed before every test
out = fun()
if out == false then print("FAIL", k) else print("PASS", k) end
end
endThe test runner is built into the file itself. You write the test in eg before the code to define the expected behavior.
Python example:
import pytest
# Write this FIRST (Red)
def test_calculator_add():
assert add(2, 3) == 5
# Write this SECOND (Green)
def add(a, b):
return a + bBigger system example: CI/CD Pipelines (GitHub Actions): Modern infrastructure won't let you merge code unless the test suite passes. The tests act as the "gatekeeper" of truth, enforcing that new code doesn't break old features.
In two.lua:
-- The logic (DATA class) doesn't know where data comes from.
function DATA:new(src)
self.rows = {}
if type(src) == "string" then csv(src, function(row) self:add(row) end) -- CSV file
else for _,row in pairs(src or {}) do self:add(row) end -- In-memory table
end
endThe business logic (DATA) is decoupled from the storage mechanism. It works equally well with a CSV filename or a raw Lua table.
Python example:
# The Repository Pattern
class UserRepository:
def get(self, id):
# Could be SQL, could be JSON, could be Redis.
# The calling code doesn't need to know.
return db.execute("SELECT * FROM users WHERE id=?", id)
user = repo.get(1) # Business logic is protected from SQL syntaxBigger system example: ODBC / JDBC: These drivers allow applications to connect to SQLite, Postgres, or Oracle using the exact same function calls. The database implementation is completely hidden from the application layer.
In two.lua:
-- Pure Logic (Core)
local function dist(r1, r2) return (r1.x - r2.x)^2 end
-- Presentation Layer (CLI)
local function cli()
print(dist(req_row(), db_row()))
end
-- If we moved to a GUI, 'dist' would not change. Only 'cli' would be replaced.The core algorithms do not contain print statements or GUI widgets. They return data, which the presentation layer decides how to display.
Python example:
# Core Logic
def calculate_tax(income):
return income * 0.3
# Web Presentation (Flask)
@app.route('/tax/<int:income>')
def web_tax(income):
return f"<h1>Tax: {calculate_tax(income)}</h1>"
# CLI Presentation (Click)
@click.command()
def cli_tax(income):
print(f"Tax is: {calculate_tax(income)}")Bigger system example: X11 Window System / React Native: The logic runs in one place (the client or the JS thread), and the "view" can be a Linux Desktop, an iOS screen, or an Android screen. The core logic is independent of the dialog toolkit.
In two.lua:
local function map(t, fun, u)
u={}; for k,v in pairs(t) do u[k]=fun(v) end; return u
end
-- Usage: Pass behavior as an argument
local squared = map({1,2,3}, function(x) return x*x end)Functions are first-class citizens. You can pass logic (functions) around just like data.
Python example:
# Passing behavior into a function
numbers = [1, 2, 3, 4]
evens = list(filter(lambda x: x % 2 == 0, numbers))
doubled = list(map(lambda x: x * 2, numbers))Bigger system example:
React Hooks / Callbacks: useEffect(() => { doSomething() }). You pass a function to the framework, and the framework calls it back when the time is right.
In two.lua (Lite Hungarian):
Instead of verbose "System Hungarian" (lpszName), two.lua uses a "Lite" schema defined in the header:
-- n=number; s=string; b=boolean
-- a,d = array,dict
-- ds,dn = dict of strings, dict of numbers
local function add(n, s, t) ... end
-- 'n' tells us it expects a number
-- 's' tells us it expects a string
-- 't' tells us it expects a tableThis reduces cognitive load without the visual clutter of full Hungarian notation.
Classic Hungarian (Historical Context):
// Systems Hungarian - prefix encodes the data type (Simonyi's original "Apps Hungarian" encoded the semantic 'kind' instead)
char *szName; // Zero-terminated String
int iCount; // Integer
long lIndex; // Long
Python example (Modern Type Hints): Modern languages moved the hint from the variable name to the syntax itself.
# Modern Type Hinting
def connect(timeout: int, retries: int = 3) -> bool:
return TrueThe code documents itself. IDEs can now catch errors before you run the code.
Bigger system example: TypeScript: JavaScript was "loose," causing millions of runtime errors. TypeScript added a layer of strict types on top, becoming the industry standard for large web apps because it catches errors at compile time.
In two.lua:
local help = [[
NAME:
two.lua
SYNOPSIS:
Classifies rows into 'best' or 'rest'.
INPUTS:
csv file with headers.
Uppercase headers are numerics.
]]The documentation isn't separate from the code; it is embedded in the code, often driving the configuration parsing (as seen in Rule 1/7).
Python example:
def square(n):
"""
Returns the square of n.
>>> square(2)
4
>>> square(-3)
9
"""
return n * nPython Docstrings (""") allow tools to auto-generate websites (Sphinx) and run tests (Doctest) directly from the documentation.
Bigger system example: OpenAPI (Swagger): You write a YAML file describing your API. This file generates the documentation and the code and the testing tools. The documentation is the code.
Yes, absolutely. In software engineering, "personnel" and "process" heuristics are arguably more critical than syntax because they govern how the code actually gets written and maintained by humans.
Here are the heuristics for tackling complexity, maintaining motivation, and sorting priorities, formatted to match your existing catalog.
In two.lua:
-- Start simple. A complex system that works is invariably found to have
-- evolved from a simple system that worked.
function DATA:new(src)
self.rows = {}
-- Version 1: Just load the data. Worry about discretization later.
if src then self:load(src) end
endDo not try to build the complex version (bins, entropy, discretization) first. Build the version that just loads the file. If that works, add the next layer.
Python example:
# The "Walking Skeleton"
# Don't build the whole API. Build one endpoint that returns "Hello World".
# Ensure the database, server, and network connect before adding logic.
@app.get("/")
def health_check():
return {"status": "ok"} Bigger system example: The MVP (Minimum Viable Product): Twitter started as a simple SMS service. Amazon started as a list of books. You cannot design a complex system from scratch; you must evolve it from a working simple system.
In two.lua:
-- Sorting the backlog (simulated)
local tasks = {
{name="fix_typo", cost=1, val=1, ratio=1},
{name="crit_bug", cost=5, val=50, ratio=10}, -- Do this first!
{name="new_feat", cost=100,val=20, ratio=0.2}
}
table.sort(tasks, function(a,b) return a.ratio > b.ratio end)
To escape the "slump" of an infinite backlog, you don't sort by "importance" (Value) alone or by "ease" (Cost) alone. You sort by the ratio of the two (Value / Cost, often called Cost of Delay divided by Duration, or CD3). This gives you the biggest bang for the buck immediately.
Python example:
# The Eisenhower Matrix in code
def get_next_task(tasks):
urgent = [t for t in tasks if t.urgent and t.important]
if urgent: return urgent[0]
plan = [t for t in tasks if not t.urgent and t.important]
return plan[0] # Schedule theseBigger system example:
SAFe (Scaled Agile Framework): Large enterprises formally calculate WSJF (Weighted Shortest Job First) = Cost of Delay / Job Size, where Cost of Delay = User-Business Value + Time Criticality + Risk Reduction/Opportunity Enablement. This removes the "loudest person in the room" bias and provides a mathematical way to pick what to work on next.
In two.lua:
-- The Repl (Read-Eval-Print Loop) approach
-- Don't write 100 lines. Write 1, print it.
local n = 10
print(n) -- Sanity check: I am not crazy, the computer is listening.Motivational slumps often come from working in the dark for too long. The cure is to shorten the loop between "I type code" and "I see result."
Python example:
# TDD Red/Green loop
# 1. Write a failing test (Instant feedback: "It failed")
# 2. Write just enough code to pass (Instant feedback: "It passed")
# The dopamine hit of the "Green Bar" keeps you moving.Bigger system example: Hot Reloading (React/Flutter/Vite): In modern web dev, you save the file and the browser updates instantly without a refresh. This keeps the developer in the "Flow State," preventing the mind from wandering during compilation times.
In two.lua:
-- This code is written using 'Lite Hungarian'
-- so that if 'timm' gets hit by a bus, someone else knows that
-- 'nC' is a number representing a count.
local function bins(nC, sName) ... endCode readability is not about the computer; it is about the next human. If only one person understands the code, the project is fragile.
Python example:
# Docstrings are knowledge transfer
def calculate_variance(data):
"""
Uses Welford's online algorithm to avoid catastrophic cancellation.
Reference: Knuth Vol 2, p 232.
"""
# The comment explains *why* we didn't just use sum(x^2),
# saving the next developer from "refactoring" it back to a buggy version.Bigger system example: Pair Programming / Code Review: In systems like Linux or Google's Monorepo, no code enters the codebase without being read by a second person. This ensures that knowledge is distributed, "escaping" the head of a single developer.
In two.lua:
-- The code is split into DATA, NUM, SYM.
-- If three different people work on this, they will naturally
-- create three distinct modules to avoid stepping on each other's toes.
"Organizations which design systems are constrained to produce designs which are copies of the communication structures of these organizations." (Conway's Law, Melvin Conway, 1968)
Python example:
# Microservices often reflect team structures.
# Team A owns the 'User' service.
# Team B owns the 'Payment' service.
# They communicate via API (HTTP) because the teams talk via Slack/Meetings.
import requests
response = requests.get('http://payment-service/api/v1/charge')Bigger system example: Amazon's "Two-Pizza Teams": Amazon structures its teams so they are small enough to be fed by two pizzas. Consequently, their software architecture is composed of thousands of small, decoupled services. If they had a giant team, they would have built a giant monolith.
You are absolutely right to keep the momentum. The Spotify example is a perfect real-world analogy for the Space-Time tradeoff because it translates "Memory vs. CPU" into terms everyone understands: "Storage vs. Bandwidth."
Here is the final, polished set of the operational heuristics (66–70), including the corrected Space-Time (with Spotify) and Idempotency entries.
In two.lua:
function NUM:add(n)
-- Don't wait for the math to return NaN later.
-- Stop execution immediately if the input is bad.
assert(type(n) == "number", "NUM:add expects a number")
self.n = self.n + 1
...
end
If the state is invalid, crash immediately. Debugging a crash at the source is 100x easier than debugging "why is my final calculation slightly off" three hours later.
Python example:
def process_age(age):
if age < 0:
raise ValueError("Age cannot be negative") # Stop right here.
return math.log(age)Bigger system example: Erlang "Let It Crash": In telecom systems (and WhatsApp), instead of trying to recover from a corrupted state, a process kills itself immediately. The supervisor detects the death and spawns a fresh, clean replacement instantly.
In two.lua:
-- 1. THE CACHE: mid() calculates the central tendency (centroid) of all columns.
-- This is expensive (it loops over every column).
local function mid(i)
-- If we are a DATA object (have rows) and have a cached answer, return it.
if i.rows and i._mid then return i._mid end
-- Otherwise, calculate it and STORE it in _mid
local t={}; for _,col in pairs(i.cols.all) do t[1+#t] = mid(col) end
i._mid = t
return i._mid
end
-- 2. THE INVALIDATION: When data changes, we must wipe the cache.
local function add(i,v, inc)
if i.rows then
i._mid = nil -- <--- DIRTY FLAG: Data changed, so old cache is invalid
...
end
end
We trade memory (i._mid) to save CPU cycles. However, unlike a static cache, this one is dynamic: the moment add() modifies the data, we explicitly set i._mid = nil so the next call to mid() knows to re-calculate.
Python example:
class Dataset:
def __init__(self):
self._cached_stats = None
self.data = []
def add(self, row):
self.data.append(row)
self._cached_stats = None # Invalidate cache
@property
def stats(self):
if self._cached_stats is None:
# Expensive calculation happens only when needed
self._cached_stats = calculate_heavy_stats(self.data)
return self._cached_statsBigger system example: Spotify Local Cache: Spotify downloads your frequently played songs to your device's hard drive (Space). When you press play, it reads from the disk instead of streaming from the internet. This trades Storage Space (on your phone) to save Time/Bandwidth (network latency).
In two.lua:
-- BAD: Reaching through objects
-- print(myModel.rows[1].cells[3])
-- GOOD: Ask the object to do the work for you
print(myModel:getCell(1, 3))An object should only talk to its immediate friends. If you change how rows are stored (e.g., from an array to a database cursor), the "Bad" code breaks. The "Good" code survives because the implementation is hidden.
Python example:
# Bad
user.wallet.credit_card.charge(100)
# Good
user.make_payment(100) # The user object knows how to handle its own wallet.Bigger system example: Microservices API Gateways: Service A never talks directly to Service B's database. It calls Service B's API. This ensures Service B can change its database schema without breaking Service A.
In two.lua:
-- The '-s' flag does two things:
-- 1. Updates the config record (the.seed)
-- 2. IMMEDIATELY forces the runtime state (math.randomseed) to match.
egs["-s"] = function(n)
math.randomseed(n) -- Reset the RNG now
the.seed = n -- Remember this for later
end
This ensures that the command-line argument isn't just a "suggestion" for future operations; it is an imperative command to reset the universe right now. This guarantees that `lua two.lua -s 42` behaves identically every single time it is run, regardless of what happened before that line was executed.
Python example:
import random
import sys
def set_seed(n):
# Update global config AND runtime state simultaneously
CONFIG['seed'] = n
random.seed(n)
if __name__ == "__main__":
if "--seed" in sys.argv:
# User input forces immediate deterministic state
set_seed(int(sys.argv[sys.argv.index("--seed") + 1]))Bigger system example: Terraform / Ansible: These tools are designed to be idempotent. If you run "Ensure Server X exists" 50 times, it creates the server once and does nothing the other 49 times. It forces the reality to match the configuration, resetting state only if necessary.
In two.lua:
-- Optimization Strategy:
-- 1. Write clear code first (Avoid Premature Opt).
-- 2. If slow, find the 'hot loop' (Pareto).
-- 3. Only optimize that loop.
function DATA:dist(row1, row2)
-- This function is called N^2 times.
-- Optimizing THIS function yields 80% of the speedup.
-- Optimizing the CLI argument parser yields 0% of the speedup.
end
Python example:
# cProfile output
# ncalls tottime filename:lineno(function)
# 100000 5.000 myscript.py:20(heavy_math) <-- The 20% to fix
# 1 0.001 myscript.py:1(setup) <-- The 80% to ignoreBigger system example: Bug Bounties: Microsoft found that fixing the top 20% of reported bugs eliminated 80% of the crashes and security errors. Not all bugs are created equal.
Here is the updated list. I have softened the tone on the "Root/Sudo" heuristic (#73) to be about privilege rather than "brokenness," and I have converted the specific warnings from your "Ell/Backpacker" text into a new category of Complexity & Dependency Anti-Patterns.
In two.lua:
-- Instead of 'require "argparse"', we write 5 lines of code:
local the={}; for k,v in help:gmatch("(%S+)=(%S+)") do the[k] = coerce(v) end
If a dependency does one simple thing (like parsing flags or left-padding a string), own it. Copy the logic into your code. Do not import a 5MB library for a 5-line function. Dependencies break over time; your own code stays stable.
Python example:
# Bad: pip install colorama (adds dependency, installation step, version conflicts)
# Good: Just define the ANSI codes you need.
class Colors:
HEADER = '\033[95m'
ENDC = '\033[0m'Bigger system example: SQLite: It has zero external dependencies. It doesn't even use the standard string library of the OS if it can avoid it. This makes it compilable on a toaster.
In two.lua:
-- We use CSV. Not Parquet, not Protocol Buffers, not HDF5.
-- CSV was readable in 1980. It will be readable in 2080.
function csv(sFilename, fun) ... endThe longer a technology has been around (Text, Shell, Lua, SQL), the longer it is likely to remain around. New, complex formats (.toml, .yaml, setup.py) churn every 5 years. Bet on the technology that has already survived.
Python example:
# Bad: Using a specific ORM version (SQLAlchemy 1.4)
# Good: Writing raw SQL queries. SQL is 50 years old and isn't changing.
cursor.execute("SELECT * FROM users WHERE id=?", (1,))Bigger system example:
Makefiles: Despite thousands of "better" task runners (Grunt, Gulp, Webpack, Turbolore), make is still here because it is simple, file-based, and standard.
In two.lua:
-- Does not require 'luarocks install'.
-- Does not require writing to /usr/bin.
-- It runs where it sits.Tools should respect the system they run on. Requiring sudo (root) to install a text-processing tool creates security risks and friction. Good tools run comfortably in user-space (~/bin) and clean up after themselves.
Python example:
# Bad: Hardcoding paths to /var/log/myapp (Requires Root)
# Good: Adhering to XDG_DATA_HOME or local directories
log_dir = os.environ.get('XDG_DATA_HOME', './logs')Bigger system example: AppImage / PortableApps: Applications that bundle everything they need into a single file that runs without installation, respecting the user's environment.
In two.lua:
-- Startup time is dominated by parsing the source code.
-- No heavy VM initialization, no container spin-up.
-- Run time < 0.01s for help.Control feels like friction if it is slow. CLI tools must start instantly. If the user hesitates to run the command because of the startup time, the tool is too heavy.
Python example:
# Bad: import pandas as pd (Takes 0.5s - 1.5s just to load)
# Good: import csv (Instant)Bigger system example:
Ripgrep (rg) vs Grep: Ripgrep is preferred by many developers not just because it searches faster, but because it starts instantly and respects .gitignore by default, reducing friction.
In two.lua:
-- The model isn't a binary blob. It's a readable table of centroids.
-- You can print(DATA.stats) and read the logic with human eyes.You cannot "own" what you cannot read. Binary formats, pickled objects, and compiled black boxes prevent you from fixing problems when the original author disappears.
Python example:
# Bad: Pickle (Python object serialization). It is opaque and dangerous.
# Good: JSON. It is verbose, but you can debug it with 'cat'.Bigger system example:
Unix /proc file system: Linux exposes kernel internals as text files. You don't need a special API to see CPU info; you just cat /proc/cpuinfo.
Based on the code in two.lua, here are common software engineering anti-patterns present in the file:
- Magic numbers: 1.7, 1e-32, and 6.28 appear directly in mathematical formulas without explanation.
- Cryptic names: heavy use of single-letter variables (i, v, c, t) reduces readability for outsiders.
- God object: the DATA class mixes multiple responsibilities: file I/O, data storage, statistical summarization, and machine learning logic.
- Global mutable state: the `the` table controls application behavior and is mutable from anywhere.
- Command/query mixing: the add function modifies the object's state and returns the value added, blurring the line between action and query.
- Implicit coercion: the coerce function silently converts strings to numbers or booleans based on regex matching, which can hide type errors.
- Type switching: the add function contains conditional logic for NUM, SYM, and DATA, meaning a change to "adding" logic requires editing one complex function.
- (fragment; the start of this item was lost in extraction) ... Manager or Controller).

| #!/usr/bin/env lua | |
| local the,l,egs = {},{},{} -- config, lib, demos | |
| local help = [[ | |
| six : stochastic incremental XAI | |
| (c) 2025, Tim Menzies, [email protected], mit-license.org | |
| Options: | |
| -h Show help. | |
| -b bins=7 Number of bins for discretization. | |
| -e era=30 Update model every `era` number of rows. | |
| -r ruleMax=3 Max conditions in a rule. | |
| -s seed=42 Random number seed. | |
| -f file=../lua6/auto93.csv ]] | |
| -- ---------------------------------------------------------------------------- | |
| local DATA, NUM, SYM, COLS = {},{},{},{} | |
-- DATA.new(rows) --> DATA ;; Make an empty DATA, then add each of `rows` (if any).
-- `i` is a local, declared as an extra argument (this file's convention).
function DATA.new(rows, i)
  i = l.new(DATA, {rows = {}, cols = nil})
  if rows then
    for _, row in pairs(rows) do i:add(row) end
  end
  return i
end
-- DATA.clone(i, rows) --> DATA ;; New DATA with the same column names as `i`,
-- then add each of `rows` (if any). `clone` is a local.
function DATA.clone(i, rows, clone)
  -- Dot call on purpose: `DATA:new(x)` would pass DATA itself as `rows`.
  clone = DATA.new({i.cols.names})
  for _, row in pairs(rows or {}) do clone:add(row) end
  return clone
end
-- NUM.new(at, s) --> NUM ;; Summary of a numeric column at position `at`
-- (default 0) named `s`. `best` is 1 when the name ends in "+", else 0.
function NUM.new(at, s)
  return l.new(NUM, {
    at = at or 0, of = s, n = 0, mu = 0, m2 = 0, sd = 0, bins = {},
    -- "+" is a pattern magic character, so escape it; tostring() never
    -- returns nil, so no fallback string is needed.
    best = tostring(s):find("%+$") and 1 or 0})
end
-- SYM.new(at, s) --> SYM ;; Summary of a symbolic column at position `at` named `s`.
function SYM.new(at, s)
  local fields = {at = at, of = s, n = 0, bins = {}, has = {}, mode = 0, most = -1}
  return l.new(SYM, fields)
end
-- COLS.new(row) --> COLS ;; Turn a row of column names into column summaries.
-- Names starting with an upper-case letter become NUMs, all others SYMs.
-- Names ending in "X" are kept in `all` but left out of both `x` and `y`;
-- names ending in "+" or "-" go to `y`, the rest to `x`.
-- `x`, `y`, `all`, `col` are locals.
function COLS.new(row, x, y, all, col)
  x, y, all = {}, {}, {}
  for n, s in ipairs(row) do
    -- The classes are NUM and SYM; `Num`/`Sym` were undefined globals.
    col = (s:match("^[A-Z]") and NUM or SYM).new(n, s)
    all[n] = col
    if not s:match("X$") then
      l.push(s:find("[+-]$") and y or x, col)
    end
  end
  return l.new(COLS, {all = all, x = x, y = y, names = row})
end
| -- ----------------------------------------------------------------------------- | |
-- DATA.add(i, row) --> row ;; The first row seen defines the columns; every
-- later row updates each column summary and is stored in `i.rows`.
function DATA.add(i, row)
  if i.cols then
    for _, col in pairs(i.cols.all) do col:add(row[col.at]) end
    l.push(i.rows, row)
  else
    i.cols = COLS.new(row)
  end
  return row
end
-- SYM.add(i, v) --> v ;; Count symbol `v` (ignoring "?") and track the mode.
function SYM.add(i, v)
  if v == "?" then return v end
  i.n = i.n + 1
  local count = 1 + (i.has[v] or 0)
  i.has[v] = count
  if count > i.most then
    i.most, i.mode = count, v
  end
  return v
end
-- NUM.add(i, n) --> n ;; Incrementally update count, mean, m2 and sd with `n`.
-- "?" values are skipped. `d` is a local.
function NUM.add(i, n, d)
  -- The test must be on the parameter `n`; the old test on undefined `v`
  -- was always true, so "?" reached the arithmetic below.
  if n ~= "?" then
    i.n = i.n + 1
    d = n - i.mu
    i.mu = i.mu + d / i.n
    i.m2 = i.m2 + d * (n - i.mu)
    i.sd = i.n < 2 and 0 or (i.m2 / (i.n - 1)) ^ 0.5
  end
  return n
end
| -- ----------------------------------------------------------------------------- | |
-- NUM.norm(i, v) --> 0..1 ;; Squash `v` through a logistic curve centred on
-- `i.mu` and scaled by `i.sd` (1e-32 avoids division by zero).
-- NOTE(review): 1.7 looks like the usual logistic-to-normal scaling constant -- confirm.
function NUM.norm(i, v)
  local z = -1.7 * (v - i.mu) / (i.sd + 1e-32)
  return 1 / (1 + math.exp(z))
end
-- SYM.bin(_, v) --> v ;; A symbol is its own bin.
function SYM.bin(_, v)
  return v
end

-- NUM.bin(i, v) --> int | "?" ;; Map `v` to floor(the.bins * norm(v)); "?" passes through.
function NUM.bin(i, v)
  if v == "?" then return v end
  return math.floor(the.bins * i:norm(v))
end
-- DATA.disty(i, row) --> n ;; Root-mean-square gap between the normalized
-- goal values of `row` and each goal's `best`. `d` is a local.
function DATA.disty(i, row, d)
  local goals = i.cols.y
  d = 0
  for _, y in pairs(goals) do
    local gap = y:norm(row[y.at]) - y.best
    d = d + gap ^ 2
  end
  return (d / #goals) ^ 0.5
end
-- DATA.bins(i, row) ;; For every x column, find the bin that `row` falls into
-- and add the row's `disty` score to that bin's NUM summary.
-- `v`, `y` are locals.
function DATA.bins(i, row, v, y)
  y = i:disty(row)
  for _, x in pairs(i.cols.x) do
    v = x:bin(row[x.at])
    if v ~= "?" then
      -- Bin summaries are NUMs named after the bin; `Num` was undefined.
      x.bins[v] = x.bins[v] or NUM.new(x.at, v)
      x.bins[v]:add(y)
    end
  end
end
| -- ----------------------------------------------------------------------------- | |
-- DATA.rule(i, stop) --> t ;; Gather every bin of every x column, sort them
-- ascending on floor(100*(mu + sd/sqrt(n))), keep the first `stop` (all when
-- `stop` is nil), and group the survivors by column position.
-- `t`, `u`, `f` are locals. The old signature listed `stop` twice, so the
-- caller's value was shadowed and always read as nil.
function DATA.rule(i, stop, t, u, f)
  f = function(n) return math.floor(100 * (n.mu + n.sd / n.n ^ .5)) end
  t, u = {}, {}
  for _, x in pairs(i.cols.x) do
    for _, col in pairs(x.bins) do l.push(t, col) end
  end
  -- ipairs, not pairs: the cut-off below depends on visiting in sorted order.
  for j, col in ipairs(l.sort(t, l.lt(f))) do
    if stop and j > stop then break end
    u[col.at] = u[col.at] or {}
    l.push(u[col.at], col) -- tests on each attribute are disjunctions
  end
  return u
end
-- DATA["or"](i, ors, row) --> bool ;; True when any bin in `ors` matches the
-- bin that `row` falls into for that bin's column.
-- `or` is a reserved word, so `function DATA.or` does not parse; the field is
-- defined (and must be called) with bracket syntax.
DATA["or"] = function(i, ors, row)
  for _, col in pairs(ors) do
    if col.of == i.cols.all[col.at]:bin(row[col.at]) then return true end
  end
  return false
end
-- DATA["and"](i, ands, row) --> bool ;; True when every group of bins in
-- `ands` has at least one match for `row`.
-- `and`/`or` are reserved words, so both fields use bracket syntax.
DATA["and"] = function(i, ands, row)
  for _, ors in pairs(ands) do
    if not i["or"](i, ors, row) then return false end
  end
  return true
end
-- DATA.selects(i, rule, rows) --> rows ;; Return the `rows` that satisfy `rule`.
-- `u` is a local.
function DATA.selects(i, rule, rows, u)
  u = {}
  for _, r in pairs(rows) do
    -- Bracket call because `and` is reserved; the old call also passed `i` twice.
    if i["and"](i, rule, r) then l.push(u, r) end
  end
  return u
end
-- main(file) ;; Load `file`, shuffle its rows into train and test halves, then
-- stream the training rows into a fresh clone. Every `the.era` rows, build a
-- rule and print the best (lowest) disty among the test rows it selects.
-- Everything after `file` is a local.
function main(file, data1, y, train, test, rule, data2, best)
  data1 = DATA.new()
  for _, row in l.csv(file) do data1:add(row) end
  train, test = {}, {}
  for j, row in ipairs(l.shuffle(data1.rows)) do
    l.push(j < #data1.rows / 2 and train or test, row)
  end
  data2 = data1:clone()
  y = function(row) return data2:disty(row) end
  for r, row in ipairs(train) do
    data2:bins(data2:add(row))
    if r % the.era == 0 then
      -- the.ruleMax defaults to 3, the value that used to be hard-coded here.
      rule = data2:rule(math.min(the.ruleMax, #data2.rows / the.era))
      -- Sort needs a comparator, so wrap the scoring function with l.lt.
      best = l.sort(data2:selects(rule, test), l.lt(y))[1]
      if best then print(y(best)) end -- nothing to report if no row matched
    end
  end
end
| -- ----------------------------------------------------------------------------- | |
| -- Misc tricks | |
-- l.new(meta, t) --> t ;; Make `t` an instance of `meta` (methods found via __index).
function l.new(meta, t)
  meta.__index = meta
  return setmetatable(t, meta)
end

-- l.fmt ;; Shorthand for string.format.
l.fmt = string.format

-- l.push(t, v) --> v ;; Append `v` to array `t`.
function l.push(t, v)
  t[#t + 1] = v
  return v
end

-- l.sort(t, f) --> t ;; Sort `t` in place with comparator `f`, then return it.
function l.sort(t, f)
  table.sort(t, f)
  return t
end

-- l.lt(f) --> function ;; Comparator that orders items ascending on `f(item)`.
function l.lt(f)
  return function(a, b)
    return f(a) < f(b)
  end
end

-- l.shuffle(t) --> t ;; Randomly reorder `t` in place. `n` is a local.
function l.shuffle(t, n)
  for m = #t, 2, -1 do
    n = math.random(m)
    t[m], t[n] = t[n], t[m]
  end
  return t
end
| -- Thing to string | |
-- _a2a(a, u) --> u ;; Append the string form of each array item of `a` to `u`.
-- Helpers live in `l`; the bare names `push` and `cat` were undefined here.
local function _a2a(a, u)
  for _, v in ipairs(a) do l.push(u, l.cat(v)) end
  return u
end
-- _d2a(d, u) --> u ;; Append ":key value" for every non-function field of `d`
-- to `u`, then sort `u` so the output order is stable.
local function _d2a(d, u)
  for k, v in pairs(d) do
    if type(v) ~= "function" then
      -- tostring(k) keeps %s safe for non-string keys on older Lua versions.
      l.push(u, l.fmt(":%s %s", tostring(k), l.cat(v)))
    end
  end
  return l.sort(u)
end
-- l.cat(t) --> s ;; String form of `t`. Whole numbers print via "%s", other
-- numbers via "%.3f", tables print as "{...}" (array style when #t > 0,
-- else sorted ":key value" pairs), anything else via tostring.
function l.cat(t)
  if type(t) == "number" then
    return l.fmt(t % 1 == 0 and "%s" or "%.3f", t)
  elseif type(t) ~= "table" then
    return tostring(t)
  end
  local items = (#t > 0 and _a2a or _d2a)(t, {})
  return "{" .. table.concat(items, " ") .. "}"
end

-- l.pat(t) --> t ;; Print the string form of `t`, then return `t`.
function l.pat(t)
  print(l.cat(t))
  return t
end
| -- String to thing | |
-- l.coerce(s) --> number | string | nil ;; nil stays nil; numeric strings
-- become numbers; anything else is returned with outer whitespace trimmed.
function l.coerce(s)
  if s == nil then return nil end
  local n = tonumber(s)
  if n then return n end
  return (s:match('^%s*(.-)%s*$'))
end
-- _cells(s) --> array ;; Split `s` on commas and coerce every cell. `a` is a local.
local function _cells(s, a)
  a = {}
  for cell in s:gmatch("([^,]+)") do
    l.push(a, l.coerce(cell))
  end
  return a
end
-- l.csv(file) --> iterator ;; Each call yields (line number, array of cells).
-- The file is closed when the end is reached. Raises if `file` cannot be opened.
-- `n`, `stream` are locals.
function l.csv(file, n, stream)
  n, stream = 0, assert(io.open(file))
  return function(s)
    s = stream:read()
    if not s then
      stream:close()
      return
    end
    n = n + 1
    return n, _cells(s)
  end
end
| -- Config management. | |
-- l.settings(s) --> table ;; Collect every "key=value" token in `s`, coercing
-- each value. `d` is a local.
function l.settings(s, d)
  d = {}
  for key, val in s:gmatch("(%S+)=(%S+)") do
    d[key] = l.coerce(val)
  end
  return d
end
-- l.cli(d, funs) --> d ;; Walk the command line left to right. If an argument
-- names an entry in `funs`, call it with the next argument (coerced).
-- Otherwise "-x val" updates every key of `d` whose first letter is "x".
function l.cli(d, funs)
  local args = arg or {}
  -- Numeric loop: pairs() gives no ordering guarantee, and order matters
  -- here (e.g. "-s 101" has to take effect before a later command runs).
  for i = 1, #args do
    local s, v = args[i], l.coerce(args[i + 1])
    if funs[s] then
      funs[s](v)
    else
      for k, _ in pairs(d) do
        if k:sub(1, 1) == s:sub(2) then d[k] = v end
      end
    end
  end
  return d
end
| -- ----------------------------------------------------------------------------- | |
-- Command-line actions, keyed by flag.
egs["-h"]      = function(_) print("\n" .. help .. "\n") end
-- "-s" resets the random number generator at once, as well as the config.
-- A missing value is ignored rather than passed on to math.randomseed.
egs["-s"]      = function(n) if n then math.randomseed(n); the.seed = n end end
egs["--the"]   = function(_) l.pat(the) end
egs["--guess"] = function(_) main(the.file) end

the = l.settings(help)
math.randomseed(the.seed)
-- Only read the command line when this file is the script being run.
-- Plain find (4th argument true) so the "." in the name is not a wildcard.
if arg and arg[0] and arg[0]:find("six.lua", 1, true) then the = l.cli(the, egs) end
return {DATA=DATA, NUM=NUM, SYM=SYM, COLS=COLS, help=help, the=the, l=l}
| #!/usr/bin/env lua | |
| local the = {leaf=2} | |
| local push, coerce, cells, csv, inc2, cli | |
| local tree, show, read,_tally, _split, _div, _kids | |
-- tree(goal, klass, rows, col, val, op) --> node ;; Build a node over `rows`.
-- `col`, `val`, `op` record the test that led here (nil at the root); the
-- node's kids are grown from the split that `_split` picks for these rows.
function tree(goal, klass, rows, col, val, op)
  local col1, val1, mu = _split(goal, klass, rows)
  local node = {rows = rows, col = col, val = val, op = op, mu = mu}
  node.kids = _kids(goal, klass, rows, col1, val1)
  return node
end
-- _split(goal, klass, rows) --> col, val, mu ;; Score every (column, value)
-- pair counted by `_tally` and return the highest scoring pair, plus the
-- mean of the klass column. Arguments after `rows` are locals.
function _split(goal, klass, rows, all, good, best, col, val, n, p, s, mu)
  all, good, mu = _tally(goal, klass, rows, {}, {})
  best = -1
  for c, vals in pairs(good) do
    local seen = all[c]
    for v, g in pairs(vals) do
      n = seen[v]
      p = g / n -- fraction of rows with this value that are "good"
      s = p * p / (p + (n - g) / n + 1e-32)
      if s > best then
        best, col, val = s, c, v
      end
    end
  end
  return col, val, mu
end
-- _tally(goal, klass, rows, all, good) --> all, good, mean ;; Count every
-- (column, value) pair into `all`, and again into `good` for rows whose
-- klass value is <= `goal`. "?" cells and the klass column are not counted.
-- Also returns the mean klass value. `y` is a local.
function _tally(goal, klass, rows, all, good, y)
  y = 0
  for _, row in pairs(rows) do
    y = y + row[klass]
    for c, v in pairs(row) do
      if not (c == klass or v == "?") then
        inc2(all, c, v)
        if row[klass] <= goal then
          inc2(good, c, v)
        end
      end
    end
  end
  return all, good, y / #rows
end
-- _kids(goal, klass, rows, col1, val1) --> table ;; Divide `rows` on
-- col1 == val1 and grow a sub-tree for each side that has at least
-- `the.leaf` rows and fewer rows than its parent. `out` is a local.
function _kids(goal, klass, rows, col1, val1, out)
  out = {}
  for op, t in pairs(_div(col1, val1, rows)) do
    local size = #t
    if size >= the.leaf and size < #rows then
      out[op] = tree(goal, klass, t, col1, val1, op)
    end
  end
  return out
end
-- _div(c, v, rows) --> {["=="]=yes, ["!="]=no} ;; Partition `rows` on whether
-- column `c` equals `v`. Rows with "?" in that column go to the "==" side.
-- `yes`, `no` are locals.
function _div(c, v, rows, yes, no)
  yes, no = {}, {}
  for _, r in pairs(rows) do
    if r[c] == "?" or r[c] == v then
      push(yes, r)
    else
      push(no, r)
    end
  end
  return {["=="] = yes, ["!="] = no}
end
| ---------------------------------------------------------------- | |
-- show(t, lvl) ;; Print tree `t`, one node per line, indented by `lvl`
-- (default 0) copies of "|.. ", with the node's test and row count.
function show(t, lvl)
  lvl = lvl or 0
  local cond = t.col and (t.col .. " " .. t.op .. " " .. t.val) or ""
  print(string.format("%s%s (n=%d)", ("|.. "):rep(lvl), cond, #t.rows))
  for _, kid in pairs(t.kids or {}) do
    show(kid, lvl + 1)
  end
end
-- read() --> goal, klass, rows ;; Read csv rows from standard input. Until a
-- klass column is found, each row is scanned for a name ending in "-" (the
-- last such column wins). Later rows are data, sorted ascending on the klass
-- column. `goal` is the klass value of the row at position floor(sqrt(#rows)).
-- Raises when no klass column or no data rows are found.
-- `rows`, `klass` are locals.
function read(rows, klass)
  rows = {}
  for row in csv() do
    if not klass then
      for i, s in pairs(row) do
        -- tostring: a purely numeric header cell arrives as a number.
        if tostring(s):find("%-$") then klass = i end
      end
    else
      rows[#rows + 1] = row
    end
  end
  assert(klass, "read: no column name ends in '-'")
  assert(#rows > 0, "read: no data rows")
  table.sort(rows, function(t, u) return t[klass] < u[klass] end)
  return rows[math.floor((#rows) ^ .5)][klass], klass, rows
end
| ------------------------------------------------------------------------------- | |
-- push(t, x) --> x ;; Append `x` to array `t`.
function push(t, x)
  t[#t + 1] = x
  return x
end

-- coerce(s) --> number | s ;; Numeric strings become numbers.
function coerce(s)
  return tonumber(s) or s
end

-- cells(s) --> array ;; Split `s` on commas, coercing each cell. `t` is a local.
function cells(s, t)
  t = {}
  for cell in s:gmatch("([^,]+)") do
    t[#t + 1] = coerce(cell)
  end
  return t
end

-- csv() --> iterator ;; Yield one array of cells per line of standard input.
function csv()
  return function()
    local s = io.read()
    if not s then return s end
    return cells(s)
  end
end

-- inc2(t, c, v) --> n ;; Increment the nested counter t[c][v]; return the new count.
function inc2(t, c, v)
  local counts = t[c] or {}
  t[c] = counts
  counts[v] = (counts[v] or 0) + 1
  return counts[v]
end
-- cli() ;; For each command-line argument "-key" where `key` is already set
-- in `the`, replace that setting with the (coerced) next argument.
function cli()
  for i, s in pairs(arg or {}) do
    local key = s:sub(2)
    if the[key] then
      the[key] = coerce(arg[i + 1])
    end
  end
end
| ------------------------------------------------------------------------------- | |
-- Run as a script, or load as a library.
-- A script receives its command-line arguments in `...` (none, or the first
-- equal to arg[1]); a require()d chunk receives the module name instead.
-- The old test `... == nil` was false whenever any flag was given, so cli()
-- could never see one.
local is_main = arg ~= nil and (select("#", ...) == 0 or (...) == arg[1])
if is_main then
  cli()
  local goal, klass, rows = read()
  show(tree(goal, klass, rows))
end
return {tree = tree, show = show, read = read}
| """ | |
| def mad(rows): | |
| a = [y(row) for row in rows] | |
| median = sorted(a)[len(a) // 2] | |
| return sorted([abs(x - median) for x in a])[len(a) // 2] | |
| def mads(a,b): n1,n2=len(a),len(b); return (n1*mad(a) + n2*mad(b))/(n1+n2) | |
| def key(rows,col,klass): | |
| x = lambda row:row[col] | |
| y = lambda row:row[klass] | |
| rows = sorted([row in rows if x(row) != "?"]), key=x) | |
| most = mad(rows) | |
| b4,min = [],[],-1,len(rows)**.5 | |
| for i in range(len(rows)): | |
| b = int(BINS/(1+e^(-1.704*(mu - x(rows[i])/sd)) | |
| left += [row] | |
| if b != b4 and i>min and len(rows) - i > min: | |
| mad1,mad2 = mads(left),mads(rows[i:]) | |
| if now < most | |
| = | |
| b4 = b | |
| for row in sorted(rows, key=lambda row | |
-- span(t, i, j) --> array ;; Copy items i..j of `t` (j defaults to #t). `u` is a local.
function span(t, i, j, u)
  u = {}
  j = j or #t
  for k = i, j do
    u[#u + 1] = t[k]
  end
  return u
end
| function cut(rows,col,klass) | |
| x = function(r) return r[col] end | |
| y = function(r) return r[klass] end | |
| lt = function(r1,r2) return x(r1) < x(r2) end | |
| t = {}; for _,r in pairs(rows) if x(r) != "?" then t[1+#t]=r end end | |
| table.sort(t,lt) | |
| most = sd(t,y) | |
| for i=1,#t do | |
| j=#t-i | |
| b = (the.BINS/(1+math.exp(-1.7404*math.exp((mu- x(t[i]))/sd))))//2 | |
| left[1 + #left] = rows[i] | |
| if b ~= b4 and i>min and j > min: | |
| sd1,sd2 = sd(span(rows,1,i),y), sd(span(rows,i+1))) / (i+j) | |
| now = (i * sd1 + j * sd2) / (i+j) | |
| if now < least then | |
| out={now, {"<=",col,x(rows[i])}, {">",col,x(rows[i+1])} | |
-- sd(row, fn) --> n ;; Sample standard deviation of fn(item) over the items of
-- `row`, skipping "?" values, using an incremental mean/m2 update.
-- Returns 0 when fewer than two values are seen. Arguments after `fn` are locals.
function sd(row, fn, v, d, n, mu, m2)
  n, mu, m2 = 0, 0, 0
  -- Iterate the `row` argument and update with `v`; the old body read the
  -- undefined names `t` and `x`.
  for i = 1, #row do
    v = fn(row[i])
    if v ~= "?" then
      n, d = n + 1, v - mu
      mu = mu + d / n
      m2 = m2 + d * (v - mu)
    end
  end
  return n < 2 and 0 or (m2 / (n - 1)) ^ .5
end
import fileinput
import random
import re
import sys
from bisect import bisect_left, insort
from math import exp, sqrt  # math has no `min`; the builtin min/max are used
class o(dict):
    """A dict whose keys can also be read and written as attributes."""
    __getattr__ = dict.__getitem__
    __setattr__ = dict.__setitem__


# Global settings.
the = o(BINS=10, LEAF=20, FILE=None, HEIGHT=3)
| # misc details | |
def is_num(s):
    """True when column name `s` starts with an upper-case letter."""
    return s[0].isupper()


def is_x(s):
    """True unless column name `s` ends in one of "+", "-", "!", "X"."""
    return not (s[-1] in "+-!X")


def is_goal(s):
    """Return 1 for names ending in "+", 0 for "-", and None otherwise."""
    last = s[-1]
    if last == "+":
        return 1
    if last == "-":
        return 0
    return None
def selects(split, row):
    """Does `row` pass the test in `split`?

    A "?" in the tested column always passes. An operator other than
    <, <=, ==, >=, > gives None.
    """
    z = row[split.col]
    if z == "?":
        return True
    # Lambdas, so only the comparison for the chosen operator is evaluated.
    tests = {
        "<":  lambda: z < split.val,
        "<=": lambda: z <= split.val,
        "==": lambda: z == split.val,
        ">=": lambda: z >= split.val,
        ">":  lambda: z > split.val,
    }
    test = tests.get(split.op)
    return test() if test else None
| ## Types are UPPERCASE(). Used only in constructor. | |
def SPLIT(op, col, val):
    """A test `col op val` on one column."""
    return o({"op": op, "col": col, "val": val})


def DATA(rows, y, names):
    """Rows, a row-scoring function `y`, and the column names."""
    return o({"rows": rows, "y": y, "names": names})


def TREE(rows, split, mid, kids):
    """A tree node: its rows, the split that led here, a middle value, sub-trees."""
    return o({"rows": rows, "split": split, "mid": mid, "kids": kids})


def STATS(n, mid, div):
    """A summary: count, middle and spread."""
    return o({"n": n, "mid": mid, "div": div})


# Constructors have Leadingcapitals() and return TYPES
Split = SPLIT  # simple constructors are types
def Stats(rows, i):
    """Summarise column `i` of `rows` as a STATS, ignoring "?" cells."""
    known = [r[i] for r in rows if r[i] != "?"]
    known.sort()
    return STATS(*_stats(known))
| def _stats(s): | |
| n = len(s) | |
| if n>= 4: mid,div = s[n//2], s[3*n//4] - s[n//4] | |
| else: mid,div = (s[-1] - s[0])/2, s[-1] - s[0] | |
| return n, mid, div / 1.349 | |
def Data(things):
    """Build a DATA from an iterable whose first item is the column names.

    Tracks the low and high of every numeric column and returns a DATA whose
    `y(row)` is the Euclidean distance from the row's normalized goal values
    to each goal's target (1 for "+" columns, 0 for "-" columns).
    """
    names, ys, rows, lo, hi = None, {}, [], {}, {}
    for i, row in enumerate(things):
        if i == 0:
            names = row
            # `is not None`: minimise goals have weight 0, which is falsy.
            ys = {j: w for j, name in enumerate(names)
                  if (w := is_goal(name)) is not None}
        else:
            rows += [row]
            for j, name in enumerate(names):
                if is_num(name) and row[j] != "?":
                    lo[j] = min(lo.get(j, 1e32), row[j])
                    hi[j] = max(hi.get(j, -1e32), row[j])

    def norm(j, n):
        # lo and hi are dicts, so index them; 1e-32 avoids division by zero.
        return (n - lo[j]) / (hi[j] - lo[j] + 1e-32)

    def Y(row):
        # Callers use data.y(row), so Y takes a whole row.
        return sqrt(sum((norm(j, row[j]) - w) ** 2 for j, w in ys.items()))

    return DATA(rows, Y, names)
def Tree(data, split=None, lvl=0):
    """Build a TREE node over `data.rows`, then recurse via `_kids`.

    The node's `mid` is the middle of the rows' `data.y` scores. `data.y` is
    a function, so the scores are computed here rather than looked up as a
    column index.
    """
    ys = sorted(data.y(row) for row in data.rows)
    return TREE(data.rows, split,
                _stats(ys)[1],
                list(_kids(data, _splits(data), lvl)))
| #----------------------------------------------------------------------------- | |
def _kids(data, splts, lvl):
    """Yield a sub-tree for each split in `splts`.

    A split is used only if it keeps at least `the.LEAF` rows and fewer rows
    than its parent. Nothing is yielded once `lvl` reaches `the.HEIGHT`.
    """
    if splts and lvl < the.HEIGHT:
        for split in splts:
            subset = [row for row in data.rows if selects(split, row)]
            if the.LEAF <= len(subset) < len(data.rows):
                # DATA (the record), not Data (the one-argument loader).
                yield Tree(DATA(subset, data.y, data.names), split, lvl + 1)
def _splits(data):
    """Return the splits of the x column with the lowest expected spread.

    Each x column is turned into sorted (x, y) pairs, ignoring "?" cells,
    and scored by `_numSplits` or `_symSplits`. Returns [] if no column
    yields any splits.
    """
    least, out = 1e32, []
    for col, name in enumerate(data.names):
        if is_x(name):
            xys = sorted((r[col], data.y(r)) for r in data.rows if r[col] != "?")
            if xys:
                splitter = _numSplits if is_num(name) else _symSplits
                tmp, splts = splitter(xys, col)
                if splts and tmp < least:
                    least, out = tmp, splts
    return out
def _symSplits(xys, col):
    """Score splitting a symbolic column on each distinct value.

    Returns (expected spread of y over the groups, one "==" Split per value).
    """
    d = {}
    for x, y in xys:
        d.setdefault(x, []).append(y)
    n = len(xys)  # `rows` is not in scope here; the pairs are the rows
    # _stats returns a tuple whose last item is the spread.
    xpect = sum(len(a) / n * _stats(sorted(a))[-1] for a in d.values())
    return xpect, [Split("==", col, v) for v in d]
| #-for get insort etc. get all theys,reviersed. appedl, push. then only resort wwn x~=b4 | |
def _numSplits(xys, col):
    """Find the cut on a numeric column with the lowest expected y spread.

    `xys` holds (x, y) pairs sorted by x. Returns (best expected spread,
    ["<" Split, ">=" Split]), or (1e32, []) if every x is the same.
    """
    n, b4, best, cuts = len(xys), xys[0][0], 1e32, []
    lo, hi = [], sorted(y for _, y in xys)
    for i, (x, y) in enumerate(xys):
        # Score the cut before moving the current y across: `lo` then holds
        # exactly the i items with a smaller x, matching Split("<", col, x).
        if x != b4:
            xpect = (i * _stats(lo)[-1] + (n - i) * _stats(hi)[-1]) / n
            if xpect < best:
                best, cuts = xpect, [Split("<", col, x), Split(">=", col, x)]
        insort(lo, hi.pop(bisect_left(hi, y)))  # move y hi ==> lo
        b4 = x
    return best, cuts
def show(tree, lvl=0):
    """Print `tree`, one node per line, indented by depth."""
    cond = ""
    if tree.split:
        sp = tree.split
        cond = f"{sp.col} {sp.op} {sp.val} "
    print(f"{'|.. ' * lvl}{cond} (w= {tree.mid}, n={len(tree.rows)})")
    for kid in tree.kids:
        show(kid, lvl + 1)
def leaf(tree, row):
    """Walk down from `tree`, following the first kid whose split accepts
    `row`, and return the node where no kid accepts it."""
    node = tree
    while True:
        nxt = next((k for k in node.kids if selects(k.split, row)), None)
        if nxt is None:
            return node
        node = nxt
| #------------------------------------------------------------------------------ | |
def coerce(x):
    """Return `x` as an int if possible, else a float, else unchanged."""
    for f in (int, float):
        try:
            return f(x)
        # Catch only failed conversions; a bare `except` would also swallow
        # KeyboardInterrupt and real bugs.
        except (ValueError, TypeError):
            pass
    return x
def csv(path):
    """Yield one list of coerced cells per non-empty line of `path`.

    All whitespace and "#" comments are removed before splitting on commas.
    A false `path` is passed to fileinput as an empty tuple of files.
    """
    with fileinput.input(files=path or (), encoding="utf-8") as f:
        for line in f:
            line = re.sub(r"\s+|#.*", "", line)
            if line:
                yield [coerce(cell) for cell in line.split(",")]
def _discretize(col, rows, i):
    """Replace each number in `col` by the smallest value sharing its bin.

    Bins come from a logistic curve centred on the column's `mid` and scaled
    by its `div` (from Stats), mapped onto 0..the.BINS. "?" cells are kept.
    """
    s = Stats(rows, i)

    def bin(z):
        # 1e-32 guards a zero spread; the cap keeps exp() from overflowing
        # when that makes the exponent enormous.
        e = -1.7 * (z - s.mid) / (s.div + 1e-32)
        return int(the.BINS / (1 + exp(min(e, 700))))

    lo = {}
    for n in col:
        if n != "?":
            b = bin(n)
            lo[b] = min(lo.get(b, 1e32), n)
    return [lo[bin(n)] if n != "?" else "?" for n in col]
def discretized(data):
    """Return a copy of `data` whose numeric columns are discretized."""
    columns = zip(*data.rows)
    new = [_discretize(col, data.rows, i) if is_num(name) else col
           for i, (name, col) in enumerate(zip(data.names, columns))]
    return DATA(list(zip(*new)), data.y, data.names)
| #-------------------------------------------------------------------- | |
if __name__ == "__main__":
    # Optional first argument: the csv file to read. With no file, csv()
    # passes an empty file list to fileinput.
    if len(sys.argv) > 1:
        the.FILE = sys.argv[1]
    # Data() is the loader defined in this file; `read` was undefined.
    data = Data(csv(the.FILE))
    disc = discretized(data)
    for row in disc.rows[:10]:
        print(row)
| .\" Manpage for two.lua | |
| .\" Contact [email protected] | |
| .TH TWO.LUA 1 "November 29, 2025" "1.0" "User Commands" | |
| .SH NAME | |
| two.lua \- stochastic incremental Explainable AI (XAI) tool | |
| .SH SYNOPSIS | |
| .B ./two.lua | |
| .RI [ OPTIONS ] | |
| .RI [ COMMANDS ] | |
| .SH DESCRIPTION | |
| .B two.lua | |
| is a Lua script that implements a stochastic incremental Explainable AI (XAI) algorithm. It processes data from CSV files to cluster data into "best" and "rest" sets, calculates distances between rows using weighted Euclidean distance (for attributes) or normalized distance to goals (for class variables), and incrementally updates a model. | |
| .PP | |
| The script handles both numeric and symbolic data types, supports normalization, and includes a test suite for demonstrating various internal statistical functions. | |
| .SH OPTIONS | |
| These options configure the hyperparameters and inputs for the algorithm. They must be provided as key-value pairs or flags. | |
| .TP | |
| .B \-h | |
| Show the help message and exit. | |
| .TP | |
| .BI \-b " bins" | |
| Set the number of bins for discretization. | |
| .IP | |
| Default: 7 | |
| .TP | |
| .BI \-e " era" | |
| Set the frequency (number of rows) at which the model updates. | |
| .IP | |
| Default: 30 | |
| .TP | |
| .BI \-f " file" | |
| Specify the path to the input CSV file. The file should contain a header row. Columns starting with uppercase letters are treated as Numerics, others as Symbols. Columns ending in '+' or '-' are treated as optimization goals (maximize/minimize). Columns ending in 'X' are ignored. | |
| .IP | |
| Default: ../lua6/auto93.csv | |
| .TP | |
| .BI \-r " ruleMax" | |
| Set the maximum number of conditions allowed in a rule. | |
| .IP | |
| Default: 3 | |
| .TP | |
| .BI \-s " seed" | |
| Set the random number generator seed for reproducibility. | |
| .IP | |
| Default: 42 | |
| .SH COMMANDS | |
| The script includes several built-in test hooks and execution modes. These arguments trigger specific functions within the script. | |
| .TP | |
| .B --csv | |
| Read the input file defined by \fB-f\fR and print every parsed row to standard output. | |
| .TP | |
| .B --cut | |
| Demonstrate the list splitting (cutting) function. | |
| .TP | |
| .B --data | |
| Load the data file and display summary statistics for the independent variables (X columns). | |
| .TP | |
| .B --distx | |
| Calculate and display distance statistics between rows based on independent variables (X). | |
| .TP | |
| .B --disty | |
| Calculate and display distance statistics based on dependent optimization goals (Y). | |
| .TP | |
| .B --inc | |
| Demonstrate incremental data updates (adding and removing rows from the statistical model). | |
| .TP | |
| .B --mode | |
| Demonstrate the mode (most frequent item) calculation. | |
| .TP | |
| .B --num | |
| Demonstrate Gaussian random number generation and summary statistics (mean, standard deviation). | |
| .TP | |
| .B --shuffle | |
| Demonstrate the array shuffling function. | |
| .TP | |
| .B --the | |
| Print the current configuration settings (the `the` table). | |
| .TP | |
| .B --two | |
| Run the main stochastic incremental XAI clustering algorithm. This splits the data into training/testing sets, performs clustering, and outputs model performance metrics. | |
| .SH EXAMPLES | |
| .B Run the main XAI algorithm with default settings: | |
| .PP | |
| .nf | |
| ./two.lua --two | |
| .fi | |
| .PP | |
| .B Process a custom dataset with a specific random seed: | |
| .PP | |
| .nf | |
| ./two.lua -f data/weather.csv -s 101 --two | |
| .fi | |
| .PP | |
| .B Inspect the columns of a dataset: | |
| .PP | |
| .nf | |
| ./two.lua -f ../lua6/auto93.csv --data | |
| .fi | |
| .SH EXIT STATUS | |
| Returns 0 on successful execution. | |
| .SH AUTHOR | |
| Tim Menzies ([email protected]) | |
| .SH LICENSE | |
| MIT License (mit-license.org) | |
| .SH COPYRIGHT | |
| Copyright (c) 2025 Tim Menzies. |
| #!/usr/bin/env lua | |
| -- __ | |
| -- /\ \__ | |
| -- \ \ ,_\ __ __ __ ___ | |
| -- \ \ \/ /\ \/\ \/\ \ / __`\ | |
| -- \ \ \_ \ \ \_/ \_/ \/\ \L\ \ | |
| -- \ \__\ \ \___x___/'\ \____/ | |
| -- \/__/ \/__//__/ \/___/ | |
-- help : usage text. Every `key=value` pair in it is later parsed into the
-- settings table, so the defaults live here and nowhere else.
local help = [[
two.lua : stochastic incremental XAI
(c) 2025, Tim Menzies, [email protected], mit-license.org
Options:
-h Show help.
-b bins=7 Number of bins for discretization.
-e era=30 Update model every `era` number of rows.
-r ruleMax=3 Max conditions in a rule.
-s seed=42 Random number seed.
-f file=../lua6/auto93.csv ]]
-- coerce(s) --> v ;; Return int or float or bool or string from `s`.
-- Returns nil when `s` is nil (e.g. a command-line flag with no value).
local function coerce(s)
  if s == nil then return nil end
  local n = tonumber(s)
  if n ~= nil then return n end
  s = s:match("^%s*(.-)%s*$")          -- trim surrounding whitespace
  if s == "true" then return true end  -- booleans, as the header promises
  if s == "false" then return false end
  return s
end
-- the : settings table; defaults are read from the `key=value` pairs in `help`.
local the = {}
for k, v in help:gmatch("(%S+)=(%S+)") do the[k] = coerce(v) end
-- Seed the generator from the default seed.
math.randomseed(the.seed)
-- Forward declarations: these are assigned further down but referenced earlier.
local DATA, NUM, SYM, COLS, clone, adds
| -- | o |_ | |
| -- | | |_) | |
| local abs,exp,sqrt,log = math.abs, math.exp, math.sqrt, math.log | |
| local max,rand,cos = math.max, math.random, math.cos | |
| local say=io.write | |
| local fmt = string.format | |
-- sort(t,f) --> t ;; Sort `t` in place using function `f`; return `t`.
local function sort(t, f)
  table.sort(t, f)
  return t
end
-- lt(f) --> f ;; Return a comparator that orders `a` before `b` when f(a) < f(b).
local function lt(f)
  return function(a, b) return f(a) < f(b) end
end
-- cat(a) --> s ;; Return a string representation of array `a`.
local function cat(a)
  return "{" .. table.concat(a, " ") .. "}"
end
-- o(v) --> s ;; Return a string representation of `v`.
-- Whole numbers print without decimals, other numbers with three.
-- Tables with a non-empty array part print as lists; all other tables print
-- as sorted ":key value" pairs.
local function o(v)
  if type(v) == "number" then
    return string.format(v % 1 == 0 and "%.0f" or "%.3f", v)
  end
  if type(v) ~= "table" then return tostring(v) end
  local u = {}
  if #v > 0 then
    for _, x in ipairs(v) do u[#u + 1] = o(x) end
  else
    for k, x in pairs(v) do u[#u + 1] = string.format(":%s %s", k, o(x)) end
    table.sort(u)  -- `pairs` order is unspecified; sort for stable output
  end
  return "{" .. table.concat(u, " ") .. "}"
end
-- s2a(s) --> a ;; Return array of words from string `s`, split on ",".
-- Field positions are preserved: an empty cell becomes the missing-value
-- marker "?" rather than being dropped (dropping it would shift every later
-- column one place to the left).
local function s2a(s, a)
  a = {}
  -- Appending "," lets one pattern consume every field, including empty ones.
  for s1 in (s .. ","):gmatch("([^,]*),") do
    s1 = coerce(s1)
    a[#a + 1] = s1 == "" and "?" or s1
  end
  return a
end
-- csv(file) --> f ;; Iterator that returns rows from `file`.
-- Raises (via assert) if `file` cannot be opened. The file is closed when
-- the last line has been read; calls after that return nil.
local function csv(file, src)
  src = assert(io.open(file))
  return function()
    if not src then return nil end  -- already exhausted and closed
    local s = src:read()
    if s then return s2a(s) end
    src:close()
    src = nil
  end
end
-- shuffle(t) --> t ;; Randomly shuffle the order of elements in `t` (in place).
local function shuffle(t, n)
  for m = #t, 2, -1 do  -- walk backwards, swapping each slot with a random earlier (or same) one
    n = math.random(m)
    t[m], t[n] = t[n], t[m]
  end
  return t
end
-- cut(a0,n,data) --> t,t ;; Split `a0` at `n` (if `data` exists, split that too).
-- Without `data`, returns two plain arrays: the first `n` items and the rest.
-- With `data`, returns two clones of `data` holding those items.
local function cut(a0, n, data)
  local a1, a2 = {}, {}
  for j, v in ipairs(a0) do
    if j <= n then a1[#a1 + 1] = v else a2[#a2 + 1] = v end
  end
  -- An explicit branch is required here: `return data and A, B or a1, a2`
  -- is three separate expressions and would call clone() with a nil `data`.
  if data then return clone(data, a1), clone(data, a2) end
  return a1, a2
end
-- mode(d) --> v ;; Return the most frequent key in `d` (a key -> count table).
-- Returns nil for an empty table; ties are broken by `pairs` order.
local function mode(d, v, n)
  v, n = nil, 0
  for v1, n1 in pairs(d) do
    if n1 > n then v, n = v1, n1 end
  end
  return v
end
-- box_muller(mu,sd) --> n ;; Return a random number from a Gaussian `mu`,`sd`.
-- Declared local so it does not leak into the global table.
local function box_muller(mu, sd)
  -- math.random() can return 0 and log(0) is -inf, so clamp the first draw.
  local u1 = math.max(math.random(), 1e-32)
  local u2 = math.random()
  return mu + sd * math.sqrt(-2 * math.log(u1)) * math.cos(2 * math.pi * u2)
end
| -- _ | _. _ _ _ _ | |
| -- (_ | (_| _> _> (/_ _> | |
-- DATA(src) --> DATA ;; Create a new DATA, populated with `src`.
-- `src` is passed straight to `adds` along with an empty DATA table.
function DATA(src)
  return adds(src, {n = 0, rows = {}, cols = nil})
end
-- clone(i,src) --> DATA ;; Return a new DATA with same structure as `i`.
-- The new DATA is built from `i`'s column names, then filled from `src`.
function clone(i, src)
  return adds(src, DATA{i.cols.names})
end
-- NUM(at,s) --> NUM ;; Create a NUM object to summarize numbers.
-- `at` is the column position (default 0) and `s` the column name.
-- `best` is 1 when the name ends in "+", otherwise 0.
function NUM(at, s)
  return {at = at or 0, of = s, n = 0, mu = 0, m2 = 0, sd = 0,
          best = tostring(s):find("%+$") and 1 or 0}
end
-- SYM(at,s) --> SYM ;; Create a SYM object to summarize symbols.
-- `has` maps each symbol seen to its count.
function SYM(at, s)
  return {at = at, of = s, n = 0, has = {}}
end
-- COLS(row) --> COLS ;; Create a COLS object from a list of column names.
-- Names starting with an uppercase letter become NUMs, all others SYMs.
-- Names ending in "X" are kept in `all` but left out of `x` and `y`.
-- Names ending in "+" or "-" go to `y`; every other name goes to `x`.
function COLS(row, t, x, y, all, col)
  x, y, all = {}, {}, {}
  for n, s in ipairs(row) do
    col = (s:match("^[A-Z]") and NUM or SYM)(n, s)
    all[n] = col
    if not s:match("X$") then
      t = s:find("[+-]$") and y or x
      t[#t + 1] = col
    end
  end
  return {all = all, x = x, y = y, names = row}
end
| -- ._ _ _ _|_ |_ _ _| _ | |
| -- | | | (/_ |_ | | (_) (_| _> | |
-- add(i,v,inc) --> v ;; Update `i` with `v` (incrementing by `inc`).
-- `i` may be a SYM (has `has`), a NUM (has `mu`) or a DATA (has `rows`).
-- `inc` defaults to 1; pass -1 to remove a previously added value.
-- Missing values ("?") are ignored.
local function add(i, v, inc)
  if v == "?" then return v end
  inc = inc or 1
  -- The first row given to a DATA is its header: it defines the columns and
  -- must not be counted, otherwise `n` ends up one more than #rows.
  if i.rows and not i.cols then
    i.cols = COLS(v)
    return v
  end
  i.n = i.n + inc
  if i.has then
    i.has[v] = inc + (i.has[v] or 0)
  elseif i.mu then
    if inc < 0 and i.n < 2 then
      -- Too few items left for stable statistics: reset.
      i.sd, i.m2, i.mu, i.n = 0, 0, 0, 0
    else
      -- Welford's incremental update (runs in reverse when inc is negative).
      local d = v - i.mu
      i.mu = i.mu + inc * d / i.n
      i.m2 = i.m2 + inc * d * (v - i.mu)
      i.sd = i.n < 2 and 0 or math.sqrt(math.max(0, i.m2) / (i.n - 1))
    end
  elseif i.rows then
    i._mid = nil  -- cached central tendency is now stale
    for _, col in pairs(i.cols.all) do add(col, v[col.at], inc) end
    -- On removal the caller is responsible for taking `v` out of `i.rows`.
    if inc > 0 then i.rows[#i.rows + 1] = v end
  end
  return v
end
-- sub(i,v) --> v ;; Decrement `v` from `i` (i.e. `add` with an increment of -1).
local function sub(i, v)
  return add(i, v, -1)
end
-- adds(src,it) --> it ;; Update `it` with all items from `src`.
-- `src` may be a file name (rows are read with `csv`), a table of items,
-- or nil (nothing is added). `it` defaults to a fresh NUM.
function adds(src, it)
  it = it or NUM()
  if type(src) == "string" then
    for row in csv(src) do add(it, row) end
  else
    for _, row in pairs(src or {}) do add(it, row) end
  end
  return it
end
-- mid(i) --> v|row ;; Return central tendency of `i`.
-- NUM: the mean. SYM: the mode. DATA: a row holding the mid of every column,
-- cached in `i._mid` until it is cleared.
local function mid(i)
  if i.mu then return i.mu end
  if i.has then return mode(i.has) end
  if i.rows then
    if not i._mid then
      local t = {}
      -- Store by column position: `pairs` order is unspecified and a nil mid
      -- would otherwise shift every later column into the wrong slot.
      for _, col in pairs(i.cols.all) do t[col.at] = mid(col) end
      i._mid = t
    end
    return i._mid
  end
end
-- norm(i,v) --> n ;; Normalize `v` 0..1 using `i`.
-- Symbols and missing values ("?") are returned unchanged; numbers are
-- passed through a logistic function of their z-score.
local function norm(i, v)
  if i.has or v == "?" then return v end
  -- 1e-32 guards against division by zero when sd is 0.
  return 1 / (1 + math.exp(-1.7 * (v - i.mu) / (i.sd + 1e-32)))
end
-- aha(col,v1,v2) --> n ;; Return distance between `v1` and `v2`.
-- Two missing values are maximally distant (1). Symbols are 0 if equal, else 1.
-- For numbers, a single missing value is replaced by whichever extreme
-- (0 or 1) lies furthest from the known, normalized value.
local function aha(col, v1, v2)
  if v1 == "?" and v2 == "?" then return 1 end
  if col.has then return v1 == v2 and 0 or 1 end
  v1, v2 = norm(col, v1), norm(col, v2)
  if v1 == "?" then v1 = v2 > 0.5 and 0 or 1 end
  if v2 == "?" then v2 = v1 > 0.5 and 0 or 1 end
  return math.abs(v1 - v2)
end
-- distx(i,row1,row2) --> n ;; Return distance `row1` to `row2` (using X cols).
-- Root of the mean squared per-column distance over `i.cols.x`.
local function distx(i, row1, row2, d)
  d = 0
  for _, x in pairs(i.cols.x) do
    d = d + aha(x, row1[x.at], row2[x.at]) ^ 2
  end
  return math.sqrt(d / #i.cols.x)
end
-- disty(i,row) --> n ;; Return distance of `row` to best goal (using Y cols).
-- Each goal column contributes the squared gap between the normalized value
-- and that column's `best` (0 or 1).
local function disty(i, row, d)
  d = 0
  for _, y in pairs(i.cols.y) do
    d = d + (norm(y, row[y.at]) - y.best) ^ 2
  end
  return math.sqrt(d / #i.cols.y)
end
-- distys(i,rows) --> rows ;; Sort `rows` by their distance to heaven.
-- Sorts in place, nearest to the goals first; `rows` defaults to `i.rows`.
local function distys(i, rows, y)
  rows = rows or i.rows
  y = function(row) return disty(i, row) end
  table.sort(rows, function(r1, r2) return y(r1) < y(r2) end)
  return rows
end
| -- _|_ |_ o ._ | | |
| -- |_ | | | | | |< | |
-- two(data) --> t ;; Incrementally cluster `data` into `best` and `rest`.
-- Shuffles `data.rows` in place, holds back half of them as `test`, seeds
-- the model with 4 labelled rows, then walks up to 256 further training rows.
-- Returns the clusters, the rows seen, the test rows, and `model`: a
-- comparator that ranks rows by how much nearer they are to `best` than `rest`.
local function two(data)
  local train, test, start, todo, seen, best, rest, d
  shuffle(data.rows)
  train, test = cut(data.rows, data.n // 2)
  start, todo = cut(train, 4)
  seen = clone(data, start)
  best, rest = cut(distys(seen), 2, data)
  d = function(row, what) return distx(seen, row, mid(what)) end
  -- ipairs, not pairs: the 256-row budget below only makes sense when rows
  -- are visited in index order.
  for n, row in ipairs(todo) do
    if n > 256 then break end
    if d(row, best) < d(row, rest) then
      add(seen, add(best, row))
      if best.n > math.sqrt(seen.n) then
        -- `best` has grown too large: move its worst row over to `rest`.
        add(rest, sub(best, table.remove(distys(best))))
      end
    end
  end
  distys(best)
  return {best = best, rest = rest, seen = seen, test = test,
          model = lt(function(row) return d(row, best) - d(row, rest) end)}
end
| -- _| _ ._ _ _ _ | |
| -- (_| (/_ | | | (_) _> | |
-- egs : command-line actions, keyed by flag. Each takes the (coerced)
-- argument that follows the flag on the command line.
local egs = {}

-- Show the help text.
egs["-h"] = function(_) print("\n" .. help .. "\n") end

-- Reset the random number seed.
egs["-s"] = function(n) math.randomseed(n); the.seed = n end

-- Print the current settings.
egs["--the"] = function(_) print(o(the)) end

-- Print every row of the data file.
egs["--csv"] = function(_)
  for row in csv(the.file) do print(o(row)) end
end

egs["--shuffle"] = function(_) print(o(shuffle{10, 20, 30, 40, 50})) end

egs["--mode"] = function(_) print(mode{d = 2, f = 10, g = 1}) end

egs["--cut"] = function(_)
  local b, c = cut({10, 20, 30, 40, 50}, 2)
  print(o(b), o(c))
  for _ = 1, 100 do b, c = cut({10, 20, 30, 40, 50}, 2) end
end

-- Sample 1000 Gaussian numbers; print their mean and standard deviation.
egs["--num"] = function(_)
  local num = NUM()
  for _ = 1, 1000 do add(num, box_muller(10, 5)) end
  print(fmt("%.3f %.3f", num.mu, num.sd))
end

-- Print the summaries of the X columns.
egs["--data"] = function(_)
  for n, col in pairs(DATA(the.file).cols.x) do print(n, o(col)) end
end

-- Print sorted X-distances between successive rows (after shuffling).
egs["--distx"] = function(_)
  local data = DATA(the.file)
  local t, rows = {}, shuffle(data.rows)  -- both local: nothing leaks to _G
  for n = 2, #rows do t[#t + 1] = distx(data, rows[n - 1], rows[n]) end
  print(o(sort(t)))
end

-- Print the Y-distance of every row, in sorted order.
egs["--disty"] = function(_)
  local data, t = DATA(the.file), {}
  distys(data)
  for n, row in pairs(data.rows) do t[n] = disty(data, row) end
  print(o(t))
end

-- Add rows one at a time, then remove them, printing the mid at n==50.
egs["--inc"] = function(_)
  local data1 = DATA(the.file)
  print(o(mid(data1)))
  local data2 = clone(data1)
  for _, row in pairs(data1.rows) do
    add(data2, row)
    if data2.n == 50 then print(o(mid(data2))) end
  end
  -- Stop when the rows run out; testing `data2.rows` alone is always true.
  while #data2.rows > 0 do
    sub(data2, table.remove(data2.rows))
    if data2.n == 50 then print(o(mid(data2))); break end
  end
end

-- Run `two` 20 times; print the sorted Y-distances (as percentages) of the
-- test row each model ranks first.
egs["--two"] = function(_)
  local t, data = {}, DATA(the.file)
  for _ = 1, 20 do
    local out = two(data)
    t[#t + 1] = (100 * disty(out.seen, sort(out.test, out.model)[1])) // 1
  end
  print(o(sort(t)))
end
-- cli(d,funs) --> nil ;; Update `d` with flags from command-line; run `funs`.
-- Arguments are handled in command-line order. A flag found in `funs` runs
-- that function with the next argument (coerced). Otherwise `-x v` sets every
-- key of `d` whose first letter is `x` to `v`.
local function cli(d, funs)
  for i, s in ipairs(arg) do
    local v = coerce(arg[i + 1])
    if funs[s] then
      funs[s](v)
    elseif v ~= nil then  -- a trailing flag with no value leaves `d` untouched
      for k, _ in pairs(d) do
        if k:sub(1, 1) == s:sub(2) then d[k] = v end
      end
    end
  end
end

-- Only act on the command line when run as a script (plain-text match on the name).
if arg and arg[0] and arg[0]:find("two.lua", 1, true) then cli(the, egs) end
| .\" Manpage for two.lua data format | |
| .\" Contact [email protected] | |
| .TH TWO_DATA 5 "November 29, 2025" "1.0" "File Formats" | |
| .SH NAME | |
| two_data \- input CSV format for the two.lua XAI tool | |
| .SH DESCRIPTION | |
| The \fBtwo.lua\fR script processes data stored in comma-separated value (CSV) files. The format relies heavily on a structured header row to define variable types, optimization goals, and ignored columns. | |
| .PP | |
| The file must be plain text, with rows separated by newlines and columns separated by commas. | |
| .SH HEADER SYNTAX | |
| The first row of the file is crucial. It defines the schema. The naming convention of the column headers tells the script how to treat the data in that column. | |
| .SS Data Types | |
| .TP | |
| .B Uppercase Start (e.g., \fIAge\fR, \fISalary\fR) | |
| Columns where the header name starts with an uppercase letter are treated as \fBNUM\fR (Numeric). The script will calculate mean and standard deviation for these. | |
| .TP | |
| .B Lowercase Start (e.g., \fIjob\fR, \fIrace\fR) | |
| Columns where the header name starts with a lowercase letter are treated as \fBSYM\fR (Symbolic). The script counts how often each symbol occurs and reports the mode for these. | |
| .SS Column Roles | |
| .TP | |
| .B Suffix '+' (e.g., \fIClass+\fR, \fIAcc+\fR) | |
| Indicates a dependent variable (target) that should be \fBMAXIMIZED\fR. These columns constitute the "Y" (goal) variables. | |
| .TP | |
| .B Suffix '-' (e.g., \fILbs-\fR, \fICost-\fR) | |
| Indicates a dependent variable (target) that should be \fBMINIMIZED\fR. These columns constitute the "Y" (goal) variables. | |
| .TP | |
| .B Suffix 'X' (e.g., \fIidX\fR, \fIDateX\fR) | |
| Indicates a column that should be \fBIGNORED\fR entirely. These are often used for unique identifiers, comments, or raw data not suitable for clustering. | |
| .TP | |
| .B No Suffix | |
| Any column without a '+', '-', or 'X' suffix is treated as an independent variable ("X" variable) used for clustering and distance calculations. | |
| .SH DATA FORMAT | |
| .TP | |
| .B Numeric Values | |
| Standard integer or floating-point numbers. | |
| .TP | |
| .B Symbolic Values | |
| String identifiers. Note that the script splits strictly on commas, so strings containing commas may cause parsing errors. | |
| .TP | |
| .B Missing Values | |
| Missing data should be represented by the question mark character (\fB?\fR). The script contains logic to handle these during distance calculations (assumed max distance). | |
| .SH EXAMPLES | |
| .PP | |
| .B Example 1: A simple optimization dataset | |
| .PP | |
| In this example: | |
| .br | |
| - \fInameX\fR is ignored (ends in X). | |
| .br | |
| - \fIAge\fR is numeric independent (starts Upper). | |
| .br | |
| - \fIjob\fR is symbolic independent (starts lower). | |
| .br | |
| - \fISalary+\fR is a numeric goal to maximize. | |
| .PP | |
| .nf | |
| nameX,Age,job,Salary+ | |
| 1,25,engineer,50000 | |
| 2,30,doctor,90000 | |
| 3,?,artist,40000 | |
| .fi | |
| .PP | |
| .B Example 2: Multi-objective car selection | |
| .PP | |
| In this example: | |
| .br | |
| - \fIClndrs\fR, \fIVol\fR, \fIHp\fR are numeric inputs. | |
| .br | |
| - \fIorigin\fR is a symbolic input. | |
| .br | |
| - \fILbs-\fR is a goal to minimize (weight). | |
| .br | |
| - \fIAcc+\fR is a goal to maximize (acceleration). | |
| .PP | |
| .nf | |
| Clndrs,Vol,Hp,origin,Lbs-,Acc+ | |
| 8,304,193,1,4732,18.5 | |
| 8,360,215,1,4615,14 | |
| 4,97,52,3,2130,24.6 | |
| .fi | |
| .SH SEE ALSO | |
| .BR two.lua (1) |
Copyright (c) 2025 Tim Menzies, MIT License
https://opensource.org/licenses/MIT
Bash is a powerful shell for automation and scripting. While famous for command-line one-liners, it's equally adept at building development environments, managing workflows, and creating portable init scripts.
Why Bash? Because it's everywhere. Pre-installed on virtually every Unix-like system, bash is the language that proves ubiquity is power. It works today, it worked 20 years ago, it will work 20 years from now.
Bash has its limits, sure: quirky syntax, easy to make subtle errors, limited data structures. That said, bash is superb for gluing tools together, managing environments, and creating scripts that just work across systems without dependencies.
1. The Shebang
First line declares the interpreter. Makes scripts executable.
#!/usr/bin/env bashUse env to find bash in PATH rather than hardcoding /bin/bash.
2. Checking Dependencies
Verify required tools exist before running script.
WANT="git nvim gawk tree figlet"
for want in $WANT; do
command -v "$want" &>/dev/null || echo "Warning: $want is NOT installed."
doneUses command -v (POSIX-compliant) instead of which.
3. Silencing Warnings
Export variables to control environment behavior.
export BASH_SILENCE_DEPRECATION_WARNING=1From ell. Prevents macOS from nagging about zsh.
4. Source vs Execute Detection
Script behaves differently when sourced vs executed.
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
# Executed directly
exec bash --init-file "${BASH_SOURCE[0]}" -i
fiBASH_SOURCE[0] is script path, $0 is invocation name.
5. Getting Script Directory
Reliably find where script lives, regardless of invocation.
Here="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"From ell. Critical for scripts that load resources.
6. Modifying PATH
Add script directory to PATH for running local tools.
export PATH="$Here:$PATH"Prepend (not append) so local versions override system ones.
7. Terminal Colors with tput
Portable color codes using terminfo database.
bold=$(tput bold)
col0=$(tput sgr0) # reset
col1=$(tput setaf 6) # cyan
col2=$(tput setaf 3) # yellowPrefer tput over ANSI codes for portability.
8. Here Documents
Multi-line strings embedded in script.
hi() {
clear
echo "${col1}"
cat<<'EOF'
_____ _ _
| ___| | | |
| |__ | | | |
EOF
echo "${col0}"
}Quote 'EOF' to prevent variable expansion in heredoc.
9. Dynamic Prompt with Functions
Build prompt from function output evaluated on each display.
branch() { git branch 2>/dev/null | awk '/^\*/ {print $2}'; }
dirty() { [[ -n $(git status -s 2>/dev/null) ]] && echo "*"; }
PROMPT_COMMAND='PS1="${bold}${col1}$(basename "$PWD")${col0} ${col2}$(branch)$(dirty)${col0} ▶ "'PROMPT_COMMAND runs before each prompt display.
10. Showing Parent and Current Directory
Compact path display showing context.
$(basename "$(dirname "$PWD")")/$(basename "$PWD")Shows parent/current instead of full path.
11. Basic Aliases
Short names for common commands.
alias Q='exit'
alias l='ls -lh'
alias la='ls -la'Use single quotes to prevent early expansion.
12. Colorized Tool Defaults
Override commands with better defaults.
alias ls="\ls --color"
alias grep='grep --color=auto'Backslash \ls prevents alias recursion.
13. Reload Function
Source script again to apply changes.
alias reload="source '$Here/ell' && echo ✅"Double quotes allow $Here expansion at definition time.
14. History Size
Store more commands for better recall.
export HISTSIZE=10000
export HISTFILESIZE=20000HISTSIZE is in-memory, HISTFILESIZE is on-disk.
15. Deduplication
Remove duplicate commands from history.
export HISTCONTROL=ignoredups:erasedupsignoredups = consecutive dups, erasedups = all dups.
16. History Options
Improve multi-line command handling.
shopt -s histappend # Append, don't overwrite
shopt -s cmdhist # Multi-line as one entryshopt -s sets shell options.
17. Simple Functions
Functions create reusable commands.
mkcd() {
mkdir -p "$1" && cd "$1"
}&& ensures cd only if mkdir succeeds.
18. Wrapping External Tools
Create configured versions of existing commands.
vi() {
nvim --clean \
--cmd "set number relativenumber" \
--cmd "set mouse=a clipboard=unnamedplus" \
"$@"
}"$@" passes all arguments to wrapped command.
19. Piping to Functions
Functions can process stdin.
plot() {
  gnuplot -p -e 'plot "-"'
}
The - tells gnuplot to read from stdin.
20. Double Bracket Tests
Modern bash conditionals with better syntax.
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
exec bash --init-file "${BASH_SOURCE[0]}" -i
fi[[ ]] supports ==, &&, ||, no quote requirements.
21. Testing for Non-Empty Strings
Check if command produced output.
[[ -n $(git status -s 2>/dev/null) ]] && echo "*"-n tests for non-empty string.
22. Stderr Redirection
Send errors to /dev/null to suppress warnings.
command -v "$want" &>/dev/null || echo "Warning"&> redirects both stdout and stderr.
23. Filtering Output
Pipe through grep to show only relevant lines.
check() {
ruff check "$1" 2>&1 | grep -v "All checks passed"
pyright "$1" 2>&1 | grep -E "^[[:space:]]+.*:[0-9]+:[0-9]+"
}2>&1 redirects stderr to stdout for piping.
24. Temporary Directories
Create and auto-cleanup temp space.
_TMP=$(mktemp -d)
trap "rm -rf '$_TMP'" EXIT INT TERMtrap ensures cleanup on exit or interrupt.
25. Command Substitution in Prompts
Execute commands in prompt strings.
PS1="${bold}${col1}\$(basename \"\$PWD\")${col0} ▶ "Escape $ with \$ to defer evaluation until prompt display.
- [[ ]] over [ ] - Modern syntax, fewer gotchas
- "$var" not $var (prevents word splitting)
- $() over backticks - Clearer, nestable
- command || handle_error
- export VAR=value for child processes
- tput for colors - More portable than ANSI codes

Copyright (c) 2025 Tim Menzies, MIT License
https://opensource.org/licenses/MIT
gawk (GNU AWK) is a powerful tool for text processing and data transformation. While famous for one-liners, it's equally adept at building data pipelines and (as seen here) implementing machine learning classifiers.
Why gawk (GNU AWK)? Because easy is not wrong. Ubiquitous and stable, gawk is the language that proves simplicity is not weakness. gawk has its limits, sure: only associative arrays for data structures; functions that return only strings or numbers; untyped variables; and interpretation only (no compiler). That said, AWK is a superb language for testing ideas, especially where the problem can be broken into chunks which can be streamed as part of a pipe.
brew install gawk # macOS
winget install gawk # windows
sudo apt install gawk # Linux (Debian/Ubuntu)
sudo dnf install gawk # Linux (Red Hat/Fedora)# Print second column from CSV
gawk -F',' '{print $2}' data.csv
# Sum numbers in first column
gawk '{sum+=$1} END {print sum}' numbers.txt
# Count lines matching pattern
gawk '/error/ {count++} END {print count}' log.txt
# Print lines 10-20
gawk 'NR>=10 && NR<=20' file.txt#!/usr/bin/env gawk -fMakes script executable. Use chmod +x script.awk then run as ./script.awk data.txt
BEGIN {
FS=","; BINS=7; CONVFMT = "%.2f"
}Runs once before processing any input. Set up variables, counters, constants. From bins script.
END {
report()
}Runs once after all input processed. Perfect for final reports, summaries. All three scripts use this.
NR==1 { head(); print } # Runs only on first line
NR>1 { body() } # Runs on all lines after first
/^[A-Z]/ { print } # Runs when line starts with uppercaseIf pattern matches, action executes. No pattern = always run. No action = print line.
BEGIN { FS="," } # Split on comma (CSV)
BEGIN { FS=" " } # Split on space (default)
BEGIN { FS="\t" } # Split on tab (TSV)Defines how AWK splits each line into fields. All three scripts use FS=",".
NF # Number of Fields in current line
NR # Number of Records (line number)
for(i=1; i<=NF; i++) # Loop through all fields
if (NR > WAIT+1) # Skip first 20 lines (nbc)$1 # First field (actual class in nbc)
$i # Field at position i
$NF # Last field
$0 # Entire line
print $2, $4 # Print 2nd and 4th fieldsIn nbc: $1 == name checks if first field matches input.
cnt[i] = 5 # Simple key-value
nk[actual]++ # Count classes (nbc)
words["hello"] = 42 # String keysNo need to declare. Keys can be strings or numbers.
cf[kl]["tp"]++ # 2D: confusion matrix (abcd)
freq[i][$i][actual]++ # 3D: column/value/class (nbc)
bmin[i][b] = v # 2D: bin minimums (bins)
gawk (4.0+) supports true arrays of arrays, as used above. Classic AWK instead simulates multi-dimensional arrays with a[i,j], which concatenates the keys with SUBSEP.
gsub(/[ \t\r]/,"") # Remove whitespace globally
gsub("old", "new", var) # Replace all in variable
sub("old", "new", var) # Replace first occurrence onlyAll scripts use gsub(/[ \t\r]/,"") to clean input.
if ($i ~ /^[A-Z]/) # Matches: starts with uppercase (bins)
if ($i !~ /[-+!]$/) # Not match: doesn't end with -+! (bins)
/^fo+bar$/ { print } # Pattern: one or more 'o'
$i ~ /[!]$/ # Find class column (nbc)function seen(i,v) {
cnt[i] += 1
return v
}
function div(a,b) {
return int(100*a/(b+1E-32))
}From abcd and bins. Define reusable logic.
function seen(i,v, d) { # d is local (note extra spaces)
d = v - mu[i] # Welford's algorithm in bins
mu[i] += d/cnt[i]
return v
}Everything is global except function parameters. Add extra params after whitespace for locals.
if (!(want in cf)) # Check if key exists (abcd)
cf[want]["tn"] = total
if ("foo" in assoc) # Test array membership
for (k in nk) # Iterate over keysprintf "..." | "sort -n" # Pipe to shell command (abcd)
print data | "sort -r" # Reverse sortAWK can pipe output to any shell command.
sqrt(m2[i]/(cnt[i]-1)) # Square root - std dev (bins)
log((nk[k] + K) / (NR-1)) # Natural log (nbc)
exp(-1.704 * (v - mu[i])) # Exponential (bins)
int(BINS / (1 + ...)) # Integer conversion (bins)printf "%5d %5d %5d", a, b, c # Fixed-width integers (abcd)
printf "%.2f", value # 2 decimal places
CONVFMT = "%.2f" # Default conversion format (bins)sd[i] = cnt[i] < 2 ? 0 : sqrt(...) # Condition ? true : false (bins)
best = best ? best : k # Set default value (nbc)cnt[i] += 1 # Add and assign (bins)
nk[actual]++ # Increment by 1 (nbc)
cf[kl]["tp"]+=(got==want) # Add boolean result (abcd)for(i=1; i<=NF; i++) # C-style: iterate fields
for(k in nk) # For-in: iterate keys (nbc)
for(r in row) # Iterate rows (bins)count[x]++ # Uninitialized = 0
sum += value # No need for sum=0 first
nk[actual]++ # Works immediately (nbc)All variables start at 0 (numeric) or "" (string context).
v += 0 # Force string to number (bins)
if (v != "?") # String comparisonThe += 0 idiom ensures numeric context.
cf[kl]["tp"]+=(got==want) # Boolean → 1 or 0 (abcd)
cf[kl]["fn"]+=(got!=want) # False = 0, True = 1Comparison operators return 0 or 1, usable in arithmetic.
s = s sep bin(i, row[r][i]) # No operator needed (bins)
sep = "," # Just place strings adjacent
print "Hello" " " "World" # → "Hello World"1E32 # Large number (infinity proxy)
1E-32 # Small number (epsilon)
hi[i] = -(lo[i] = 1E32) # Initialize bounds (bins)hi[i] = -(lo[i] = 1E32) # lo[i]=1E32 returns 1E32
if (n = split(...)) # Assign and testAssignments are expressions that return the assigned value.
div(a, b+1E-32) # Avoid /0 (abcd)
sd[i] + 1E-32 # Epsilon guard (bins)length(freq[i]) # Number of keys (nbc)
length(nk) # Count classes (nbc)Returns number of elements in associative array.
if ((v!="?") && (i in hi)) # && stops if first is false (bins)
a || b # || stops if first is truerow[NR-1][i] = seen(i,$i) # Build custom structure (bins)Store fields in your own arrays for later processing.
{ gsub(/[ \t\r]/,"") } # Applies to ALL linesImplicit pattern (always true) modifies every line.
BEGIN { FS=","; BINS=7; CONVFMT = "%.2f" }Semicolons separate statements on same line.
"5" + 3 # → 8 (string becomes number)
x = 5; print x "" # → "5" (number becomes string)AWK automatically converts based on context.
delete ARGV[1] # Remove one element
delete array # Delete entire arrayFree memory or remove unwanted elements.
ARGC # Number of command-line args
ARGV[n] # Command-line arguments
FILENAME # Current input filename
OFS # Output field separatorgetline name < "/dev/stdin" # Read from stdin (nbc example)
getline var < "file.txt" # Read from file
"cmd" | getline var # Read from command
getline # Read next line into $0system("echo foobar") # Execute shell command
system("date") # Run external programsReturns exit status of command.
f = count ^ 2 # Power operator
f ^= 2 # Compound assignmenttolower("HELLO") # → "hello"
toupper("hello") # → "HELLO"Useful for case-insensitive comparisons.
substr("foobar", 2, 3) # → "oob" (start, length)
substr("foobar", 4) # → "bar" (start to end)
match(str, /regex/) # → position of match (or 0)
split("a:b:c", arr, ":") # Split into arraySee bins, nbc, and abcd scripts for real-world examples using these techniques:
# Discretize then classify then evaluate
bins < diabetes.csv | nbc | abcdEach script demonstrates multiple techniques working together to solve classification problems efficiently.
Copyright (c) 2025 Tim Menzies, MIT License
https://opensource.org/licenses/MIT
make is a powerful tool for automating tasks. While famous for
compiling code, it's just as useful for managing data science
pipelines, documentation, and (as seen in this Gist) running script
examples. It manages dependencies and executes commands, saving you
from re-running entire workflows when only one part has changed.
This document uses the makefile from this
Gist as
its primary source of examples.
makefile (The source for all examples below)make is pre-installed on virtually all Linux and macOS systems. You can check by running:
make --versionFor Windows, make is available through tools like Windows Subsystem for Linux (WSL), Git Bash, or by installing it with a package manager like Chocolatey (choco install make).
# Run the default goal (in this case, 'help')
make
# Run the "test" recipe to execute all examples
make test
# Run a single, specific task
make eg-abcd
# "Dry run" - show commands without executing them
make -n eg-abcdThe fundamental syntax. make will ensure dependency is up-to-date
before running the recipe for target.
tmp/out.log: data.csv
cat data.csv | classify > tmp/out.log # recipe that builds the target from its dependency
Note that the first time we run make tmp/out.log, the target is generated.
And if we run it again, nothing happens unless data.csv is updated.
The indented shell commands to run for a target. Note: You must use a Tab, not spaces.
ok:
chmod +x nbc abcd binsLines starting with # are ignored, perfect for documentation.
# vim: ts=2 sw=2 noetIf you just type make, it runs the first target in the file. In this makefile, the first target is help.
help: ## show this help
...Declares that a target does not create an actual file. This prevents
make from getting confused if a file with the same name (e.g.,
test) exists.
.PHONY: help egs ok eg-nbc eg-abcd eg-soybean eg-globals pull pushPhony targets act as convenient, memorable names for running complex commands.
test: eg-nbc eg-abcd eg-soybean eg-globals ## run all egs: nowTargets can depend on other targets, creating a chain of operations.
Here, test depends on eg-nbc, which in turn depends on ok.
make will automatically run them in the correct order: ok, then
eg-nbc, then test.
test: eg-nbc eg-abcd eg-soybean eg-globals ## run all egs: now
eg-nbc: ok ## run naive bayes classifier
ok:; chmod +x nbc abcd binsThe value is computed once at the point of definition and stored.
CYAN := \033[1;36mThe value is a reference that is expanded every time the variable is used.
cdata=~/gits/timm/moot/classify/diabetes.csvReference variables using $(...) or ${...}.
hi=@echo -e "\n--------- $(YELLOW)$@$(RESET)"Specifies which shell to use for running recipes.
SHELL=/bin/bashAn internal make variable that holds the path to the current working directory.
export PATH := $(CURDIR):$(PATH)Refers to the name of the target being executed. In the example,
$@ will be eg-nbc when make eg-nbc is run.
hi=@echo -e "\n--------- $(YELLOW)$@$(RESET)"
eg-nbc: ok ## run naive bayes classifier
$(hi); cat $(cdata) | bins | nbc | sort -n | uniq -cA list of all makefiles that were read. The help target cleverly uses this to read its own source code.
... 'BEGIN { FS=":.*## "; ... }' $(MAKEFILE_LIST)Prefixing a command with @ prevents make from printing the command to the console before running it.
@echo -e "\n$(CYAN)$$(cat bird.txt)$(RESET)\n"A special target that tells make not to print any commands before running them. This is why you only see the output of the scripts in this Gist.
.SILENT:It's common practice to use the $(MAKE) variable to call make from within make. This preserves settings and options.
eg-soybean: ## run classifier on soybean
$(hi); $(MAKE) cdata=$(soybean) eg-abcdYou can change a variable's value from the command line. The eg-soybean target relies on this, calling eg-abcd but overriding the cdata variable for that one run.
# This is what the 'eg-soybean' target effectively runs:
make eg-abcd cdata=~/gits/timm/moot/classify/soybean.csvA common (but not built-in) convention. By putting ## after a target, you create a help comment that a help target can automatically parse and print.
test: eg-nbc eg-abcd eg-soybean eg-globals ## run all egs: now
eg-nbc: ok ## run naive bayes classifier
eg-abcd: ok ## run classifier with confusion matrixThis gold-standard help target combines gawk, $(MAKEFILE_LIST), and the ## convention to automatically generate a help menu from the comments in the file itself.
CYAN := \033[1;36m
YELLOW := \033[1;33m
GREEN := \033[1;32m
RESET := \033[0m
help: ## show this help
gawk 'BEGIN { FS=":.*?## "; \
printf "\n%smake%s [%soptions%s]:\n\n", \
"$(CYAN)", "$(RESET)", "$(YELLOW)", "$(RESET)"} \
NF==2 && $$1~/^[a-z0-9A-Z_-]+/ \
{ printf " %s%-15s%s %s\n", \
"$(GREEN)", $$1, "$(RESET)", $$2}' $(MAKEFILE_LIST)This code parses a Makefile to generate a help menu:
Todo
Now
Later
cat data | era | bins | bestrest | tree# tree test = check=5