data mining for multi-objective optimization


Moot0


(For quick access to this material, use http://tiny.cc/moot0.)

A tiny example of data mining used to handle multi-objective optimization tasks. It offers a very simple (and fast) baseline method against which other methods can be compared.

AI doesn’t need to be massive — it needs to be clearer. This code shows how simple AI tools, refined over decades, can do much of what the big models do (*), only faster, cheaper, and with a purpose you can actually explain. So this code is about reverse-engineering AI: stripping away the hype, finding the essentials, and building systems that last — tools that still work when frameworks fade and corporations move on.

(*) For classification, regression, optimization. Generation, on the other hand, is a different matter.

Install

 # suggestion: install data near root
 git clone http://github.com/timm/moot ~/gits/moot # <-- data
 # install the code anywhere 
 git clone http://tiny.cc/moot0 moot0         # <-- code
 cd moot0

Test data

http://github.com/timm/moot

Contents

Group              Notes
Doco               _MOOT0.md
Dev tools          ell
Code               config.py: global settings
                   data.py: data read and store
                   lib.py: misc utils
                   optimize.py: optimizer as data miner
Work in progress   rules.sh: trying some XAI
                   tree.sh
Demos              rq0.sh: what data is here?
                   rq1.sh: how well do we optimize?
                   rq2.sh: statistical comparison of results from different sample sizes

Suggested rig:

cd moot0
bash rq1.sh # in this gist

If this crashes, edit rq1.sh so that its FILES= variable points to wherever the MOOT data repo was installed (see Install, above).

Plotting the rq1.sh results generates the following graph. In a result worthy of further research, increasing the sample size from 25 to 50 to 100 to 200 labels barely changes the results. Does this mean:

  • The following code is broken?
  • The data sets being explored here are simplistic?
  • Or that there are previously unreported ceiling effects in the MOOT data?
    • If so, does that hold just for MOOT, or does it suggest a way to simplify a wide range of search-based optimization problems?
(figure: win scores from rq1.sh for 25, 50, 100, and 200 labels)

License

Licensed under the MIT License.
Copyright (c) 2025 Tim Menzies.

# cocoon.py — minimal, readable, 80-col, risk tables on top
from random import uniform

class o(dict):
  "A dict whose keys can also be read/written as attributes (o.x == o['x'])."
  __getattr__ = dict.get
  __setattr__ = dict.__setitem__

def has(ako, lo=1, hi=6): return o(ako=ako, lo=lo, hi=hi)

def have():
  "Attribute ranges: '+','-' feed the effort multipliers; '*' feed the scale factors."
  p, n, s = "+", "-", "*"
  return o({
    "Acap": has(n), "Cplx": has(p,1,6), "Prec": has(s,1,6),
    "Aexp": has(n), "Data": has(p,2,5), "Flex": has(s,1,6),
    "Ltex": has(n), "Docu": has(p),     "Arch": has(s,1,6),
    "Pcap": has(n), "Pvol": has(p,2,5), "Team": has(s,1,6),
    "Pcon": has(n), "Rely": has(p),     "Pmat": has(s,1,6),
    "Plex": has(n), "Ruse": has(p,2,6),
    "Sced": has(n), "Stor": has(p,3,6),
    "Site": has(n), "Time": has(p,3,6), "Tool": has(n)
  })
_ = None
ne = [
[_, _, _, 1, 2, _],
[_, _, _, _, 1, _],
[_, _, _, _, _, _],
[_, _, _, _, _, _],
[_, _, _, _, _, _],
[_, _, _, _, _, _]
]
ne46 = [
[_, _, _, 1, 2, 4],
[_, _, _, _, 1, 2],
[_, _, _, _, _, 1],
[_, _, _, _, _, _],
[_, _, _, _, _, _],
[_, _, _, _, _, _]
]
nw = [
[2, 1, _, _, _, _],
[1, _, _, _, _, _],
[_, _, _, _, _, _],
[_, _, _, _, _, _],
[_, _, _, _, _, _],
[_, _, _, _, _, _]
]
nw4 = [
[4, 2, 1, _, _, _],
[2, 1, _, _, _, _],
[1, _, _, _, _, _],
[_, _, _, _, _, _],
[_, _, _, _, _, _],
[_, _, _, _, _, _]
]
sw = [
[_, _, _, _, _, _],
[_, _, _, _, _, _],
[1, _, _, _, _, _],
[2, 1, _, _, _, _],
[_, _, _, _, _, _],
[_, _, _, _, _, _]
]
sw4 = [
[_, _, _, _, _, _],
[_, _, _, _, _, _],
[1, _, _, _, _, _],
[2, 1, _, _, _, _],
[4, 2, 1, _, _, _],
[_, _, _, _, _, _]
]
sw26 = [
[_, _, _, _, _, _],
[_, _, _, _, _, _],
[_, _, _, _, _, _],
[1, _, _, _, _, _],
[2, 1, _, _, _, _],
[_, _, _, _, _, _]
]
sw46 = [
[_, _, _, _, _, _],
[_, _, _, _, _, _],
[1, _, _, _, _, _],
[2, 1, _, _, _, _],
[4, 2, 1, _, _, _],
[_, _, _, _, _, _]
]
risk = o({
"Cplx": o({"Acap": sw46, "Pcap": sw46, "Tool": sw46}),
"Ltex": o({"Pcap": nw4}),
"Pmat": o({"Acap": nw, "Pcap": sw46}),
"Pvol": o({"Plex": sw}),
"Rely": o({"Acap": sw4, "Pcap": sw4, "Pmat": sw4}),
"Ruse": o({"Aexp": sw46, "Ltex": sw46}),
"Sced": o({
"Cplx": ne46, "Time": ne46, "Pcap": nw4, "Aexp": nw4, "Acap": nw4,
"Plex": nw4, "Ltex": nw, "Pmat": nw, "Rely": ne, "Pvol": ne, "Tool": nw
}),
"Stor": o({"Acap": sw46, "Pcap": sw46}),
"Team": o({"Aexp": nw, "Sced": nw, "Site": nw}),
"Time": o({"Acap": sw46, "Pcap": sw46, "Tool": sw26}),
"Tool": o({"Acap": nw, "Pcap": nw, "Pmat": nw})
})
def from_(lo, hi): return uniform(lo, hi)
def within(x, lo, hi): return max(lo, min(hi, x))
def int_(x): return int(x)

def y(sign, z):
  "Map an attribute setting z to a numeric effect, with some random spread."
  if sign == "+": return (z-3)*from_(0.073, 0.21)
  if sign == "-": return (z-3)*from_(-0.187, -0.078)
  return (z-6)*from_(-1.58, -1.014)

def effort(i, coc):
  "COCOMO-shaped effort: a * loc^(b + 0.01*scaleFactors) * effortMultipliers."
  em, sf = 1, 0
  for k, t in coc.items():
    if not isinstance(t, dict) or "ako" not in t: continue  # skip i.x, i.y
    v = i.y[k]
    if t.ako in ("+", "-"): em *= v
    else: sf += v
  return i.y.a * (i.y.loc ** (i.y.b + 0.01*sf)) * em

def risks(i, r):
  "Sum the risk table cells selected by the current attribute settings."
  if not r: return 0
  return sum(
    r[a1][a2][i.x[a1]][i.x[a2]]
    for a1 in r for a2 in r[a1]
    if r[a1][a2][i.x[a1]][i.x[a2]] is not None
  ) / 108

def cocoon(i=None, coc=None, r=None):
  "Fill in any missing x settings, then add the derived y values."
  i   = i or have()
  coc = coc or i
  r   = r or risk
  if "x" not in i: i.x = o()
  if "y" not in i: i.y = o()
  for k, t in coc.items():
    if not isinstance(t, dict) or "ako" not in t: continue  # skip i.x, i.y
    lo, hi = t.lo, t.hi
    old = i.x.get(k)
    i.x[k] = int_(within(old, lo, hi)) if old is not None else int_(from_(lo, hi))
    if k not in i.y: i.y[k] = y(t.ako, i.x[k])
  i.y.a      = i.y.get("a")   or from_(2.3, 9.18)
  i.y.b      = i.y.get("b")   or ((.85-1.1)/(9.18-2.2))*i.y.a + .9 + (1.2-.8)/2
  i.y.loc    = i.y.get("loc") or from_(2, 2000)
  i.y.effort = effort(i, coc)
  i.y.risk   = risks(i, r)
  return i

if __name__ == "__main__":
  m = have()
  m = cocoon(m)
  print("x:", m.x)
  print("y:", m.y)
  print("effort:", m.y.effort, "risk:", m.y.risk)
# Copyright (c) 2025 Tim Menzies, MIT License
# https://opensource.org/licenses/MIT
"""
moot0.py: active learning for multi-objective reasoning
(c)2025 Tim Menzies, MIT license
Explains more with less: picks just a few examples, builds tiny models,
and shows where trade-offs hide.
Options:
-b big=1e32 an impossibly big number
-B Budget=25 number of samples to build a model
-C Check=5 number of samples to test a model
-c cuts=5 number of divisions of numerics
-e eps=.5 min list size = len**eps
-F Few=2048 number of random samples for exploration
-k k=1 Laplace smoothing for Bayes attribute smoothing
-m m=2 m-estimate for Bayes class smoothing
-p p=2 distance coefficient (2 = Euclidean)
-r repeats=20 number of repeated runs with new seeds
-s seed=42 random seed
-f file=~/gits/moot/optimize/config/SS-A.csv
"""
from types import SimpleNamespace as obj
import lib,re
def eg_h(): print(__doc__)
the = obj(**{k:lib.coerce(v)
for k,v in re.findall(r"(\w+)=(\S+)",__doc__)}) #pyright:ignore
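# --- Illustration: a minimal sketch (not from the gist) of what the settings
# object built above contains, assuming this block is the config.py module
# imported by the other files. Every "key=value" pair in the docstring becomes
# a coerced attribute of 'the':
if __name__ == "__main__":
  print(the.Budget, the.Check, the.p, the.seed)  # -> 25 5 2 42
  print(the.file)                                # -> ~/gits/moot/optimize/config/SS-A.csv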
#!/usr/bin/env python3 -B
# Copyright (c) 2025 Tim Menzies, MIT License
# https://opensource.org/licenses/MIT
from config import the,obj
from math import sqrt
#### Constructors --------------------------------------------------------------
def NUM(at=0,s=" "): return obj(it=NUM, at=at, txt=s, n=0, mu=0, m2=0, sd=0,
                                lo=the.big, hi=-the.big, optimal=s[-1] != "-")
def SYM(at=0,s=" "): return obj(it=SYM, at=at, txt=s, n=0, has={})

def COLS(names):
  all = [(NUM if s[0].isupper() else SYM)(i,s) for i,s in enumerate(names)]
  tmp = [col for col in all if col.txt[-1] == '!']
  return obj(it=COLS, names=names, all=all, klass=(tmp[0] if tmp else None),
             y = [col for col in all if col.txt[-1] in "-+"],
             x = [col for col in all if col.txt[-1] not in "-+!X"])

def DATA(src, txt=""):
  src = iter(src)
  return adds(src, obj(it=DATA, n=0, txt=txt, rows=[], cols=COLS(next(src))))
def clone(data, rows=None, txt=""):
  "Make a new DATA, copying the column structure of a prior DATA."
  return adds(rows or [], DATA([data.cols.names], txt))
#### Update --------------------------------------------------------------------
def adds(src, it=None):
  it = it or NUM(); [add(it,x) for x in src]; return it

def sub(x,v): return add(x,v,-1)

def add(x, v, inc=1):
  if v == "?": return v
  x.n += inc
  if x.it is SYM:
    x.has[v] = inc + x.has.get(v,0)
  elif x.it is DATA:
    [add(col, v[col.at],inc) for col in x.cols.all]
    (x.rows.append if inc > 0 else x.rows.remove)(v)
  elif x.it is NUM:
    x.lo, x.hi = min(v, x.lo), max(v, x.hi)
    if inc<0:
      x.mu = x.m2 = x.sd = x.n = 0
    else:
      d     = v - x.mu
      x.mu += inc * (d / x.n)
      x.m2 += inc * (d * (v - x.mu))
      x.sd  = 0 if x.n < 2 else sqrt(max(0,x.m2)/(x.n-1))
  return v
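# --- Illustration: a minimal sketch (not from the gist) of the column-name
# conventions used by COLS/DATA above. Uppercase first letter = NUM (else SYM);
# names ending in "+"/"-" are goals to maximize/minimize (the y columns); "!"
# marks a class column; everything else (not ending in "X") is an x column.
# The header and rows here are made up:
if __name__ == "__main__":
  eg = DATA([["Cores", "ram", "Latency-", "Throughput+"],
             [4, "low",  20,  90],
             [8, "high", 15, 120]])
  print([c.txt for c in eg.cols.x])  # -> ['Cores', 'ram']
  print([c.txt for c in eg.cols.y])  # -> ['Latency-', 'Throughput+']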
#!/usr/bin/env python3 -B
# Copyright (c) 2025 Tim Menzies, MIT License
# https://opensource.org/licenses/MIT
from config import the
from data import NUM
from math import sqrt,log,pi
### Query ----------------------------------------------------------------------
def norm(c,v) : return v if v=="?" else (v - c.lo) / (c.hi - c.lo + 1/the.big)
def mids(data) : return [mid(c) for c in data.cols.all]
def mid(c) : return c.mu if c.it is NUM else max(c.has,key=c.has.get)
def div(c) : return c.sd if c.it is NUM else entropy(c.has)
def entropy(d):
  N = sum(d.values())
  return -sum(p*log(p,2) for n in d.values() if (p:=n/N) > 0)
### Distance -------------------------------------------------------------------
def dist(src):
  d,n = 0,0
  for v in src: n, d = n+1, d + v**the.p
  return (d/n) ** (1/the.p)

def disty(data, row):
  return dist(abs(norm(c, row[c.at]) - c.optimal) for c in data.cols.y)

def distx(data, row1, row2):
  return dist(_x(c, row1[c.at], row2[c.at]) for c in data.cols.x)

def _x(col, a, b):
  if a==b=="?": return 1
  if col.it is NUM:
    a,b = norm(col,a), norm(col,b)
    a = a if a != "?" else (0 if b>0.5 else 1) #pyright:ignore
    b = b if b != "?" else (0 if a>0.5 else 1) #pyright:ignore
    return abs(a - b)
  return a != b
### Likelihood -------------------------------------------------------------------
def like(col, v, prior=0):
  ε = 1 / the.big
  if col.it is NUM:
    var = col.sd**2 + ε
    return -(v - col.mu)**2 / (2*var) - 0.5*log(2*pi*var)
  return log(max(ε, (col.has.get(v,0) + the.k*prior) / (col.n + the.k + ε)))

def likes(data, row, nall=100, nh=2):
  b4 = (data.n + the.m) / (nall + the.m*nh)
  return log(b4) + sum(like(x, v, b4) for x in data.cols.x
                       if (v:=row[x.at]) != "?") #pyright:ignore
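# --- Illustration: a minimal sketch (not from the gist) of disty, the "distance
# to heaven" used throughout: each goal column is normalized to 0..1, compared
# against its best value (1 for "+" goals, 0 for "-" goals), and the differences
# are combined by dist(). The rows here are made up:
if __name__ == "__main__":
  from data import DATA
  eg = DATA([["Latency-", "Throughput+"],
             [20,  90],
             [15, 120],   # best on both goals, so smallest disty
             [30,  60]])  # worst on both goals, so largest disty
  for row in eg.rows:
    print(row, round(disty(eg, row), 2))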
#!/usr/bin/env bash
usage() { cat <<'EOF' | fmt -70
ell: tiny portable shell environment: self installs, useful prompts/aliases/etc.
Copyright (c) 2025 Tim Menzies, MIT License.
https://opensource.org/licenses/MIT
Usage: sh ell
Motto: Own what matters (VITAL), skip what doesn't (YAGNI).
Why ell? Because you don't want a manual— you want control. Not
helpless, but motivated and enabled. You'd rather fix friction than
live with it. It can be better and you can make it so. Such control
should be easy, fast, portable: minimal setup, no dependencies, no
root. Code should start, work, and leave nothing behind.
But control doesn't mean building everything. YAGNI (You Ain't Gonna
Need It) says skip unnecessary features. VITAL (Vital Infrastructure:
Acquire Locally) says own what's critical. Your shell environment?
Critical. Every flag in ls or awk? Not critical.
When critical infrastructure lives elsewhere, you're exposed.
Remember left-pad? Supply-chain attacks? Richard Hipp, creator of
SQLite, understood this. He called it "backpacking"—carrying only
what you need. He nearly built on Berkeley DB until Oracle bought
it and started charging licensing fees. But Hipp didn't care— he'd
already written his own engine. Now it's the most deployed database
in history. Self-reliance at scale.
ell is lightweight control. I don't reboot or replace my OS— I refine,
then release, with nothing left behind. Containers add weight. Nix
adds complexity. Package managers promised simplicity; now we script
around them.
ell is readable, portable, and teaches by showing: shell variables,
directories, temp files, small tools playing nicely. Each trick
opens a door to design and simplicity. Its methods can last decades.
Your parents could have written this; your kids might fix it. But
will they grasp (e.g.) .toml and why it is replacing setup.py?
So why wrestle chaos when control can be simple? Take the ell
challenge: strip it down, make it run, walk away clean.
EOF
}
NEED="git python3 nvim gawk tree ruff pyright"
OPTIONAL="bat cmatrix eza gawk htop micro ncdu tree watch yazi zellij"
bold=$(tput bold) col0=$(tput sgr0) col1=$(tput setaf 6) col2=$(tput setaf 3)
hi() { clear; echo "${col1}"; cat<<EOF
.-.
_/ ..\
( \ u/__ There is no escape...
\ \__) ... from (sh)ell.
/ \
__/ \
( _._.-._/
jgs '-'
EOF
echo "${col0}"
}
inst() {
local m=""
for p in $1; do command -v "$p" &>/dev/null || m+="$p "; done
[ "$m" ] && case "$(uname -s)" in
Darwin*) brew install $m ;;
Linux*) sudo apt install -y $m ;;
MINGW*) winget install $m ;;
esac
}
# If executed (not sourced), start bash with this as init file
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
hi
inst $NEED
exec bash --init-file "${BASH_SOURCE[0]}" -i
fi
# Core setup
Here="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
Xdir="$(basename "$(dirname "$Here")")"
export BASH_SILENCE_DEPRECATION_WARNING=1
export PATH="$Here:$PATH"
# Colors & prompt
branch() { git branch 2>/dev/null | awk '/^\*/ {print $2}'; }
dirty() { [[ -n $(git status -s 2>/dev/null) ]] && echo "*"; }
PROMPT_COMMAND='PS1="${bold}${col1}$(basename "$(dirname "$PWD")")/$(basename "$PWD")${col0} ${col2}$(branch)$(dirty)${col0} ▶ "'
# Essential aliases
alias ..='cd ..' c='clear' Q='exit'
alias l='ls -lh' la='ls -la' t='tree -L 1' ls="\ls --color"
alias gs='git status -sb' ga='git add .' gp='git push' gl='git log --oneline --graph --decorate'
alias gc='read -p "Commit message: " msg && git commit -am "$msg" && git push'
alias h='history'
alias py='python3 -B'
alias tree='tree -hC'
alias reload="source '$Here/ell' && echo ✅"
# nvim control
# - --clean: ignore config files, factory defaults
# - number + relativenumber: show line numbers with distance from cursor
# - cursorline: highlight current line | mouse=a: enable mouse in all modes
# - clipboard=unnamedplus: yank/paste uses system clipboard
# - ignorecase + smartcase: case-insensitive search unless capitals used
# - expandtab: spaces not tabs | tabstop + shiftwidth=2: 2-space indents
# - splitright + splitbelow: new splits open right/below (not left/above)
# - undofile: persistent undo across sessions | undodir: where undo files live
# - zaibatsu: color scheme
# - laststatus=2: always show status bar
# - statusline: custom format with file, position, percentage
# - netrw(Lex): file browser. no banner, tree view (style 3), split to side, 15% width
# - Q key: quit all buffers/windows at once
vi() {
nvim --clean \
--cmd "set number relativenumber cursorline mouse=a clipboard=unnamedplus ignorecase smartcase" \
--cmd "set expandtab tabstop=2 shiftwidth=2 splitright splitbelow" \
--cmd "set undofile undodir=~/.vim/undo" \
--cmd "colorscheme zaibatsu" \
--cmd "set laststatus=2" \
--cmd "set statusline=%#StatusLine#\ ▶\ %f\ %m%r%=%y\ ❖\ %l:%c\ ❖\ %p%%\ " \
--cmd "let g:netrw_banner=0 | let g:netrw_liststyle=3 | let g:netrw_browse_split=4 | let g:netrw_winsize=15" \
--cmd "nnoremap Q :quitall<CR>" \
"$@"
}
_TMP=$(mktemp -d)
alias mu="micro -config-dir '$_TMP'"
trap "rm -rf '$_TMP'" EXIT INT TERM
cat > "$_TMP/settings.json" <<'EOF'
{ "colorscheme": "atom-dark",
"tabsize": 2,
"tabstospaces": true,
"ruler": true,
"mouse": true,
"autoindent": true,
"cursorline": true,
"statusline": true,
"savecursor": true,
"saveundo": true
}
EOF
# History improvements
export HISTSIZE=10000
export HISTFILESIZE=20000
export HISTCONTROL=ignoredups:erasedups # No duplicates
# Mega useful extras
alias grep='grep --color=auto' less='less -R'
alias ports='lsof -i -P -n | grep LISTEN'
alias myip='curl -s ipinfo.io/ip'
mkcd() { mkdir -p "$1" && cd "$1"; }
plot() { gnuplot -p -e 'plot "-"'; }  # plot numbers piped in on stdin (needs gnuplot)
checks() { for f in *.py; do check $f; done; }
check() {
# E,F=errors, B=bugs, I=imports, N=naming, UP=modern syntax
# ignore: E501=line-length, E701/2=multi-statement, E731=lambda
# for more rules, see https://docs.astral.sh/ruff/rules/
echo "$1"
ruff check \
--select=E,F,B,I,N,UP \
--ignore=E501,E701,E702,E731 \
--output-format=concise "$1" 2>&1 \
| grep -v "All checks passed"
pyright "$1" 2>&1 | grep -E "^\s+.*:\d+:\d+"
}
getsMac() {
inst $OPTIONAL
brew install --cask font-fira-code-nerd-font
alias ls='eza --icons'
alias cat='bat -p'
}
# Create a bash init script with MIT license that:
# - Checks for required tools (git, python3, nvim, gawk, tree, ruff, pyright)
# - Shows ASCII art greeting when launched
# - Can be executed directly or sourced
# - Sets up colorized prompt showing: parent/current dir and git branch
# - Includes essential aliases for: navigation, git workflow, file listing
# - Tries to work with off-the-shelf config with minimal installs (currently, none)
# - Don't overwrite existing config files; instead, make config in temp dirs
# - Creates a vi() function wrapping nvim with: line numbers, mouse support,
# system clipboard, smart search, 4-space tabs, persistent undo, custom
# statusline, and configured netrw file browser
# - Creates a micro() function with useful defaults from that editor.
# - Add comprehensive inline comments explaining each nvim setting
# - Include history improvements (no duplicates, larger size)
# - Add utility functions: mkcd, plot, check (ruff+pyright)
# - Use tput for colors, keep style compact and professional
#!/usr/bin/env python3 -B
# Copyright (c) 2025 Tim Menzies, MIT License
# https://opensource.org/licenses/MIT
from data import DATA, SYM, add, clone, the
from datafun import distx
from lib import cli, csv, shuffle
from pathlib import Path
def eg__diabetes(): _classify(Path.home() / "gits/moot/classify/diabetes.csv")
def eg__soybean() : _classify(Path.home() / "gits/moot/classify/soybean.csv")
#------------------------------------------------------------------------------
if __name__ == "__main__": cli(the, globals())
# Copyright (c) 2025 Tim Menzies, MIT License
# https://opensource.org/licenses/MIT
import traceback, fileinput, random, sys, os
def shuffle(lst): random.shuffle(lst); return lst
def csv(file=None):
  file = os.path.expanduser(file) if file else "-"
  for line in fileinput.input(files=file):
    if (line := line.split("%")[0]):
      yield [coerce(s.strip()) for s in line.split(",")]

def coerce(s):
  try: return int(s)
  except Exception:
    try: return float(s)
    except Exception: return {'True':True, 'False':False}.get(s,s)

def cli(settings, funs):
  "Update settings from command line. Maybe call 'eg' functions."
  for n,s in enumerate(sys.argv):
    if (fn := funs.get(f"eg{s.replace('-', '_')}")):
      random.seed(settings.seed)
      try: fn()
      except Exception: traceback.print_exc()
    else:
      for key in vars(settings):
        if s=="-"+key[0]:
          settings.__dict__[key] = coerce(sys.argv[n+1])
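# --- Illustration: a minimal sketch (not from the gist) of how cli() maps
# command-line flags onto settings: "-B 50" updates the first setting whose
# name starts with "B" (coercing "50" to an int); a flag such as "--run" would
# call an eg__run() function if one appeared in 'funs'.
if __name__ == "__main__":
  from types import SimpleNamespace as obj
  settings = obj(Budget=25, seed=42)           # stand-in for config.the
  sys.argv = ["demo", "-B", "50", "-s", "1"]   # pretend command line
  cli(settings, {})                            # no eg functions registered here
  print(settings.Budget, settings.seed)        # -> 50 1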
#!/usr/bin/env python3 -B
# Copyright (c) 2025 Tim Menzies, MIT License
# https://opensource.org/licenses/MIT
from data import DATA, SYM, add, clone, the
from datafun import like,likes
from lib import cli, csv, shuffle
from pathlib import Path
def classify(datas, row):
  "Return the class (DATA) that most 'likes' this row."
  nall = sum(data.n for data in datas)
  return max(datas, key=lambda d : likes(d,row,nall,len(datas)))

def _classify(file):
  "Incremental Naive Bayes: after the first few rows, classify-then-train."
  acc, datas, data = 0, {}, None
  for i,row in enumerate(csv(file)):
    if i==0: data = DATA([row])
    else:
      want = row[data.cols.klass.at]
      if i >= 5:
        acc += want == classify(datas.values(), row).txt
      datas[want] = datas.get(want) or clone(data, txt=want)
      add(datas[want], row)
  print(int(100*acc/(i-5)))
def eg__diabetes(): _classify(Path.home() / "gits/moot/classify/diabetes.csv")
def eg__soybean() : _classify(Path.home() / "gits/moot/classify/soybean.csv")
#------------------------------------------------------------------------------
if __name__ == "__main__": cli(the, globals())
#!/usr/bin/env python3 -B
# Copyright (c) 2025 Tim Menzies, MIT License
# https://opensource.org/licenses/MIT
from config import the
from data import DATA, NUM, clone, adds,add
from datafun import disty, distx, mids
from lib import cli,csv
from math import sqrt
import random,re
from stats import top
def optimize(data):
  "Sort test data on model learned from training data. Check first few items."
  random.shuffle(data.rows)
  KnowAll = lambda r: disty(data,r)
  rows = data.rows
  n    = len(rows)//2
  train,test = rows[:n], rows[n:]
  labelled = clone(data, train[:the.Budget])
  test = sorted(test[:the.Few], key = model(labelled))
  out  = min(test[:the.Check], key = KnowAll)
  labelled.rows.sort(key=KnowAll)
  return KnowAll(out), out, labelled

def model(labelled):
  "Return something that guesses sort order of unlabelled rows, using x vals."
  KnowSome = lambda r: disty(labelled,r)
  labelled.rows.sort(key = KnowSome)
  b    = int(sqrt(the.Budget))
  best = mids(clone(labelled, labelled.rows[:b])) # mid of best labelled
  rest = mids(clone(labelled, labelled.rows[b:])) # mid of other labelled
  return lambda row: distx(labelled,row,best) - distx(labelled,row,rest)
#-------------------------------------------------------------------------------
def eg__run():
  data  = DATA(csv(the.file))
  stats = adds(disty(data,row) for row in data.rows)
  win   = lambda v: int(100* (1 - (v - stats.lo)/ (stats.mu - stats.lo)))
  wins  = adds(win(optimize(data)[0]) for _ in range(the.repeats))
  print(f"file= {the.file} ",
        *[f"{k}= {stats.__dict__[k]:.3f}" for k in ["lo","mu","hi"]],
        "win=", round(wins.mu,3))

def eg__samples():
  data  = DATA(csv(the.file))
  stats = adds(disty(data,row) for row in data.rows)
  win   = lambda v: int(100* (1 - (v - stats.lo)/ (stats.mu - stats.lo)))
  def run(s):
    random.shuffle(data.rows)
    the.Check  = 5
    the.Budget = s - the.Check
    return win(optimize(data)[0])
  rxs  = {s:[run(s) for _ in range(the.repeats)] for s in [10,20,40,60,80,160]}
  best = top(rxs, reverse=True, eps=stats.sd*.35)
  for s,lst in rxs.items():
    print(f"{'! ' if s in best else ' '}n={s} : {int(adds(lst).mu):3}", end=", ")
  print(f"y : {len(data.cols.y):2}, x : {len(data.cols.x):4}, r : {len(data.rows):6}", end=", ")
  print(re.sub("^.*/","",the.file))
if __name__=="__main__": cli(the, globals())
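# --- Illustration (not from the gist): eg__run() and eg__samples() above are
# reached from the shell via lib.cli(); e.g., assuming the MOOT data repo is
# installed as described in the README:
#
#   python3 -B optimize.py -B 50 -C 5 -f ~/gits/moot/optimize/config/SS-A.csv --run
#
# "-B 50" sets the.Budget, "-C 5" sets the.Check, "-f ..." sets the.file, and
# "--run" calls eg__run(). The rq*.sh demo scripts below drive these same entry
# points over all the MOOT csv files.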
#!/usr/bin/env bash
# Copyright (c) 2025 Tim Menzies, MIT License
# https://opensource.org/licenses/MIT
WORDS=20
FILES=~/gits/moot/optimize/*/*.csv
show() { gawk '{printf"%s ",$1}NR%'$WORDS'<1{print""}END{print""}'; }
printf "\n|projects|\n"
ls $FILES | gawk 'END {print NR}'
printf "\nIn the following, each line is $WORDS results.\n"
printf "\n|Y|\n"
head -1 $FILES | grep , | gawk -F, '{print gsub(/[+-]/,0)}' | sort -n | show
printf "\n|X|\n"
head -1 $FILES | grep , | gawk -F, '{print NF - gsub(/[+-]/,0)}' | sort -n | show
printf "\n|rows|\n"
wc -l $FILES | gawk '/total/ {next} {print $1 - 1}' | sort -n | show
#!/usr/bin/env bash
# Copyright (c) 2025 Tim Menzies, MIT License
# https://opensource.org/licenses/MIT
FILES=~/gits/moot/optimize/*/*.csv
mkdir -p ~/tmp
for config in "160 10 150" "80 10 70" "40 10 30" "20 5 15"; do
read samples C B <<< "$config"
printf "%s\n" $FILES | \
xargs -P 10 -I {} python3 -B optimize.py -C $C -B $B -f {} --run | \
sort | awk '{print $NF}' > ~/tmp/$samples
done
echo "n samples=160 samples=80 samples=40 samples=20"
paste ~/tmp/{160,80,40,20} | sort -n | cat -n
echo -e "PCT\t320\t160\t80\t40\t20"
for p in 10 30 50 70 90; do
echo -n "$p%"
for n in 160 80 40 20; do
sort -n ~/tmp/$n |
awk '{a[NR]=$1} END {printf "\t%s", a[int(NR*'$p'/100)]}'
done
echo
done
#!/usr/bin/env bash
# Copyright (c) 2025 Tim Menzies, MIT License
# https://opensource.org/licenses/MIT
FILES=~/gits/moot/optimize/*/*.csv
mkdir -p ~/tmp
printf "%s\n" $FILES |
xargs -P 20 -I {} python3 -u -B optimize.py -C 5 -f {} --samples |
tee ~/tmp/samples |
grep --color=always --line-buffered '!'
cat ~/tmp/samples |
gawk -F' *[,:] +| +' '
{for(i=1;i<=NF;i++) {
if (sub(/!/,"",$i)) {
n[$(i+1)] += 1
sum[$(i+1)] += $(i+2) }}}
END {for(i in n)
print "samples : " i ",\twins : " int(100*n[i]/NR) " %,\twhen winning :" int(sum[i]/n[i])}'
# samples : 25, wins : 39 %, win :64
# samples : 50, wins : 60 %, win :69
# samples : 100, wins : 80 %, win :71
# samples : 200, wins : 94 %, win :73
import random, math
def same(x, y, Ks=0.95, Delta="smed"):
  "True if x,y indistinguishable and differ by just a small effect."
  x, y = sorted(x), sorted(y)
  n, m = len(x), len(y)
  def _cliffs():
    "How frequently are x items gt,lt than y items?"
    gt = sum(a > b for a in x for b in y)
    lt = sum(a < b for a in x for b in y)
    return abs(gt - lt) / (n * m)
  def _ks():
    "Return max distance between the two cdfs."
    xs = sorted(x + y)
    fx = [sum(a <= v for a in x)/n for v in xs]
    fy = [sum(a <= v for a in y)/m for v in xs]
    return max(abs(v1 - v2) for v1, v2 in zip(fx, fy))
  ks     = {0.1:1.22, 0.05:1.36, 0.01:1.63}[round(1 - Ks,2)]
  cliffs = {'small':0.11,'smed':0.195,'medium':0.28,'large':0.43}[Delta]
  return _cliffs() <= cliffs and _ks() <= ks * ((n + m)/(n * m))**0.5
#------------------------------------------------------------------------------
def top(rxs, reverse=False, same=same, eps=0.01, Ks=.95, Delta="smed"):
  "Return the subset of rxs's keys associated with best scores."
  its = sorted([(sum(v)/len(v), len(v), k, v) for k,v in rxs.items() if v],
               reverse=reverse)
  while len(its) > 1:
    vals = [v for _, _, _, v in its]
    mu   = sum(l12 := sum(vals, [])) / len(l12)
    cut, sc, left, right = 0, 0, [], []
    for i in range(1, len(its)):
      l1, l2 = sum(vals[:i], []), sum(vals[i:], [])
      m1, m2 = sum(l1)/len(l1), sum(l2)/len(l2)
      s = (len(l1)*(m1-mu)**2 + len(l2)*(m2-mu)**2) / len(l12)
      if sc < s and abs(m1 - m2) > eps:
        sc, cut, left, right = s, i, l1, l2
    if not (cut > 0 and not same(left,right,Ks=Ks,Delta=Delta)): break
    its = its[:cut]
  return {k for _, _, k, _ in its}
#------------------------------------------------------------------------------
def eg__stats(m=20, n=15):
  "FYI most of the time is in _cliffs"
  def mu(lst): return sum(lst)/len(lst)
  def r3(v): return round(v,2)
  def weibull(n=100):
    shape, scale = random.uniform(0.5, 3), random.uniform(1, 4)
    return [r3(min(10, scale * (-math.log(random.random())) ** (1/shape) * 2.5))
            for _ in range(n)]
  samples = {x: weibull(n) for x in range(m)}
  [print(x) for x in
   sorted((r3(mu(lst)), i, sorted(lst)) for i,lst in samples.items())]
  return top(samples, eps=0.05)

if __name__== "__main__": print(eg__stats())
#!/usr/bin/env python3 -B
# Copyright (c) 2025 Tim Menzies, MIT License
# https://opensource.org/licenses/MIT
# Tree functions from ezr.py, adapted for data.py structures
from data import NUM, SYM, DATA, adds, add, sub
from datafun import disty, mid, div
from lib import csv, cli
from config import the, obj
# ## Tree Generation ----------------------------------------------------
def treeSelects(row, op, at, y):
  "Have we selected this row?"
  if (x:=row[at]) == "?" : return True
  if op == "<=" : return x <= y
  if op == "==" : return x == y
  if op == ">"  : return x > y

def Tree(data, rows=None, Y=None, Klass=None, how=None):
  "Create tree from list of lists"
  rows  = rows or data.rows
  Y     = Y or (lambda row: disty(data,row))
  Klass = Klass or NUM
  tree  = obj(rows=rows, how=how, kids=[], mu=mid(adds(Y(r) for r in rows)))
  eps   = 3  # int(len(data.rows) ** the.eps)
  if len(rows) > eps:
    best_spread, best_cuts = min(
      (treeCuts(c, rows, Y, Klass) for c in data.cols.x), default=(the.big, []))
    if best_spread < the.big:
      for cut in best_cuts:
        subset = [r for r in rows if treeSelects(r, *cut)]
        if eps < len(subset) < len(rows):
          tree.kids += [Tree(data, subset, Y, Klass, cut)]
  return tree

def treeCuts(col, rows, Y, Klass):
  "Return best cut for column at position 'at'"
  xys = sorted((r[col.at], Y(r)) for r in rows if r[col.at] != "?")
  return (_symCuts if col.it is SYM else _numCuts)(col.at, xys, Klass)

def _symCuts(at, xys, Klass):
  "Cuts for symbolic column."
  d = {}
  for x, y in xys:
    d[x] = d.get(x) or Klass()
    add(d[x], y)
  here = sum(ys.n/len(xys) * div(ys) for ys in d.values())
  return here, [("==", at, x) for x in d]

def _numCuts(at, xys, Klass):
  "Cuts for numeric columns."
  spread, cuts, left, right = the.big, [], Klass(), Klass()
  [add(right,y) for _, y in xys]
  leaf_min = int(len(xys) ** the.eps)
  for i, (x, y) in enumerate(xys[:-1]):
    add(left, sub(right, y))
    if x != xys[i+1][0]:
      if leaf_min <= i < len(xys) - leaf_min:
        now = (left.n*div(left) + right.n*div(right)) / (left.n+right.n)
        if now < spread:
          spread = now
          cuts   = [("<=", at, x), (">", at, x)]
  return spread, cuts
# ## Tree Processing -------------------------------------------------
def treeLeaf(tree, row):
  "Find which leaf a row belongs to"
  for kid in tree.kids:
    if treeSelects(row, *kid.how): return treeLeaf(kid, row)
  return tree

def treeShow(data, tree, lvl=0, win=None):
  "Display tree structure recursively"
  if not win:
    print(len(tree.rows))
    b4  = adds(disty(data, row) for row in data.rows)
    win = lambda v: int(100 * (1 - (v - b4.lo) / (b4.mu - b4.lo)))
  if tree.how:
    op, at, y = tree.how
    y    = round(y, 2) if isinstance(y, float) else y
    leaf = ("<"+("-"*len(tree.rows))) if not tree.kids else ""
    print(f"{win(tree.mu):4}: {'| '*(lvl-1)}if {data.cols.names[at]} {op} {y} {leaf}")
  for kid in sorted(tree.kids, key=lambda k: k.mu):
    treeShow(data, kid, lvl+1, win)

def eg__tree():
  from optimize import optimize
  data = DATA(csv(the.file))
  _, _, labelled = optimize(data)
  tree = Tree(labelled)
  treeShow(data, tree)
if __name__ == "__main__": cli(the, globals())
timm commented Oct 27, 2025

At the end of ell, after all function definitions, let ell run command line args

then use ell to debug pre-commit hooks before adding them to .git/hooks/pre-commit

if declare -f "$1" > /dev/null; then
  "$@"
fi

# Otherwise, if executed (not sourced), start interactive bash
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
    hi
    inst $NEED
    exec bash --init-file "${BASH_SOURCE[0]}" -i
fi


timm commented Oct 27, 2025

move likes and like into data. make nbc.py real short
