Skip to content

Instantly share code, notes, and snippets.

@exd02
Created March 9, 2026 16:13
Show Gist options
  • Select an option

  • Save exd02/c7bb8016e9533548a8e7307ab25c67cb to your computer and use it in GitHub Desktop.

Select an option

Save exd02/c7bb8016e9533548a8e7307ab25c67cb to your computer and use it in GitHub Desktop.
Markov model analysis for hintfiles
import sys
from collections import defaultdict
from pathlib import Path
from math import log2
# =========================
# Consts
# =========================
FILE_MAGIC = b"UTXO"  # 4-byte magic expected at the start of a hintfile
FILE_VERSION = 0  # only format version this parser understands
FILE_HEADER_SIZE = 9 # 4 bytes magic + 1 byte version + 4 bytes stop_height
BLOCK_HEADER_SIZE = 12 # 4 bytes height + 8 bytes offset
# =========================
# Util
# =========================
def fmt(n: int) -> str:
    """Render *n* with dots as thousands separators, e.g. 1048576 -> '1.048.576'."""
    grouped = format(n, ",")
    return grouped.replace(",", ".")
# =========================
# Parser hintfile
# =========================
def read_hintfile(path: str):
    """
    Parse a hintfile and return its raw bytes plus the per-height offset table.

    Layout (all integers little-endian):
      header : 4-byte magic "UTXO" + 1-byte version + 4-byte stop_height
      table  : stop_height entries of (4-byte height, 8-byte offset)

    Args:
        path: filesystem path to the hintfile.

    Returns:
        tuple: (data, offsets) where data is the whole file as bytes and
        offsets maps block height -> byte offset of that block's hint record.

    Raises:
        ValueError: if the file is too small, the magic/version is wrong, or
        the file ends before the full block-header table.
    """
    data = Path(path).read_bytes()
    if len(data) < FILE_HEADER_SIZE:
        raise ValueError("Invalid hint file: too small")
    magic = data[0:4]
    version = data[4]
    stop_height = int.from_bytes(data[5:9], "little")
    if magic != FILE_MAGIC or version != FILE_VERSION:
        raise ValueError("Invalid magic/version")
    # Bug fix: the original read past the end of a truncated file, where
    # slicing yields b"" and int.from_bytes(b"", "little") == 0 — every
    # missing header silently collapsed into a bogus offsets[0] = 0 entry.
    table_end = FILE_HEADER_SIZE + stop_height * BLOCK_HEADER_SIZE
    if len(data) < table_end:
        raise ValueError("Invalid hint file: truncated block header table")
    offsets = {}
    cursor = FILE_HEADER_SIZE
    for _ in range(stop_height):
        height = int.from_bytes(data[cursor:cursor + 4], "little")
        offset = int.from_bytes(data[cursor + 4:cursor + 12], "little")
        offsets[height] = offset
        cursor += BLOCK_HEADER_SIZE
    return data, offsets
# =========================
# Order-k entropy analysis
# =========================
def analyze_block_order_k(bit_bytes: bytes, bit_count: int, k: int, counts: dict):
    """
    Accumulate context counts for order-k entropy estimation.

    For each position i >= k, reads the k-bit context (bits i-k..i-1) and the
    next bit (bit i), then increments counts[context][next_bit].  The context
    is packed into an int (MSB = oldest bit).  Blocks are treated as
    independent sequences: the context does NOT carry over between blocks.

    Args:
        bit_bytes: raw bytes of the block bit array (MSB-first within a byte)
        bit_count: number of valid bits in this block
        k: context order (number of conditioning bits)
        counts: dict[int -> [int, int]] mapping context -> [count_0, count_1];
            mutated in place
    """
    # A block that cannot fill even one context window contributes nothing.
    if bit_count <= k:
        return
    # Unpack every bit MSB-first in one pass, then truncate to the valid
    # count — replaces the original nested loop that re-checked len(bits)
    # after every single bit and needed a double break to exit.
    bits = [(byte >> shift) & 1
            for byte in bit_bytes
            for shift in range(7, -1, -1)][:bit_count]
    # Seed the initial context from the first k bits.
    context = 0
    for bit in bits[:k]:
        context = (context << 1) | bit
    mask = (1 << k) - 1
    for i in range(k, bit_count):
        next_bit = bits[i]
        # Single-lookup update (the original did a membership test plus a
        # separate indexing lookup on every iteration).
        bucket = counts.get(context)
        if bucket is None:
            bucket = counts[context] = [0, 0]
        bucket[next_bit] += 1
        # Slide context window: drop oldest bit, append next_bit.
        context = ((context << 1) | next_bit) & mask
def analyze_hintfile_order_k(path: str, k: int):
    """
    Sweep every block of the hintfile and estimate the order-k conditional
    entropy of the hint bit stream.

    Args:
        path: path to the hintfile to analyze.
        k: context order (number of conditioning bits).

    Returns:
        tuple: (H_k in bits/symbol, total transitions counted, unique contexts)
    """
    data, offsets = read_hintfile(path)
    counts = {}  # context (int) -> [count_0, count_1]
    total_blocks = len(offsets)
    for idx, offset in enumerate(offsets.values(), start=1):
        num_hints = int.from_bytes(data[offset:offset + 4], "little")
        byte_count = (num_hints + 7) // 8
        bit_start = offset + 4
        analyze_block_order_k(data[bit_start:bit_start + byte_count],
                              num_hints, k, counts)
        if idx % 50_000 == 0:
            print(f" [{idx}/{total_blocks}] blocks processed, "
                  f"unique contexts so far: {fmt(len(counts))}", flush=True)
    # H_k is the count-weighted average of H(next | ctx) over all contexts;
    # a context seen with only one outcome is deterministic (zero entropy)
    # and therefore adds nothing to the weighted sum.
    total = 0
    weighted_entropy = 0.0
    for n0, n1 in counts.values():
        n = n0 + n1
        total += n
        if n0 and n1:
            p0 = n0 / n
            p1 = n1 / n
            weighted_entropy += n * -(p0 * log2(p0) + p1 * log2(p1))
    h_k = weighted_entropy / total if total > 0 else 0.0
    return h_k, total, len(counts)
# =========================
# Main
# =========================
if __name__ == "__main__":
    # Hintfile path from argv, falling back to the default filename.
    hintfile = sys.argv[1] if len(sys.argv) > 1 else "main.hints"
    # k values to sweep — adjust upper bound as memory/time allows
    k_values = [1, 2, 4, 8, 12, 16, 20]
    print(f"Analyzing: {hintfile}")
    print()
    print(f"{'k':>4} {'H_k (bits/sym)':>16} {'unique contexts':>16} {'size estimate':>16}")
    print("-" * 60)
    # Transition count from the first sweep; it is approximately stable
    # across k values, so reuse it for every size estimate.
    total_bits = None
    for order in k_values:
        print(f"k={order}...", flush=True)
        entropy, transitions, n_contexts = analyze_hintfile_order_k(hintfile, order)
        if total_bits is None:
            total_bits = transitions
        est_mb = (entropy * total_bits) / 8 / 1024 / 1024
        print(f"{order:>4} {entropy:>16.6f} {fmt(n_contexts):>16} {est_mb:>13.1f} MB")
        print()
@exd02
Copy link
Author

exd02 commented Mar 9, 2026

For each block we store: 4 bytes of block height, 8 bytes of block offset, and 4 bytes for the total hint count (not always exactly 4 bytes — in practice it is a compactsize) = 16 bytes × 916,874 blocks = 14,669,984 bytes ≈ 14.67 megabytes.

Analysis Output
Analyzing: main.hints
   k    H_k (bits/sym)   unique contexts     size estimate
------------------------------------------------------------
k=1...
  [50000/916874] blocks processed, unique contexts so far: 2
  [100000/916874] blocks processed, unique contexts so far: 2
  [150000/916874] blocks processed, unique contexts so far: 2
  [200000/916874] blocks processed, unique contexts so far: 2
  [250000/916874] blocks processed, unique contexts so far: 2
  [300000/916874] blocks processed, unique contexts so far: 2
  [350000/916874] blocks processed, unique contexts so far: 2
  [400000/916874] blocks processed, unique contexts so far: 2
  [450000/916874] blocks processed, unique contexts so far: 2
  [500000/916874] blocks processed, unique contexts so far: 2
  [550000/916874] blocks processed, unique contexts so far: 2
  [600000/916874] blocks processed, unique contexts so far: 2
  [650000/916874] blocks processed, unique contexts so far: 2
  [700000/916874] blocks processed, unique contexts so far: 2
  [750000/916874] blocks processed, unique contexts so far: 2
  [800000/916874] blocks processed, unique contexts so far: 2
  [850000/916874] blocks processed, unique contexts so far: 2
  [900000/916874] blocks processed, unique contexts so far: 2
   1          0.249780                 2          103.0 MB
k=2...
  [50000/916874] blocks processed, unique contexts so far: 3
  [100000/916874] blocks processed, unique contexts so far: 4
  [150000/916874] blocks processed, unique contexts so far: 4
  [200000/916874] blocks processed, unique contexts so far: 4
  [250000/916874] blocks processed, unique contexts so far: 4
  [300000/916874] blocks processed, unique contexts so far: 4
  [350000/916874] blocks processed, unique contexts so far: 4
  [400000/916874] blocks processed, unique contexts so far: 4
  [450000/916874] blocks processed, unique contexts so far: 4
  [500000/916874] blocks processed, unique contexts so far: 4
  [550000/916874] blocks processed, unique contexts so far: 4
  [600000/916874] blocks processed, unique contexts so far: 4
  [650000/916874] blocks processed, unique contexts so far: 4
  [700000/916874] blocks processed, unique contexts so far: 4
  [750000/916874] blocks processed, unique contexts so far: 4
  [800000/916874] blocks processed, unique contexts so far: 4
  [850000/916874] blocks processed, unique contexts so far: 4
  [900000/916874] blocks processed, unique contexts so far: 4
   2          0.234132                 4           96.6 MB
k=4...
  [50000/916874] blocks processed, unique contexts so far: 5
  [100000/916874] blocks processed, unique contexts so far: 16
  [150000/916874] blocks processed, unique contexts so far: 16
  [200000/916874] blocks processed, unique contexts so far: 16
  [250000/916874] blocks processed, unique contexts so far: 16
  [300000/916874] blocks processed, unique contexts so far: 16
  [350000/916874] blocks processed, unique contexts so far: 16
  [400000/916874] blocks processed, unique contexts so far: 16
  [450000/916874] blocks processed, unique contexts so far: 16
  [500000/916874] blocks processed, unique contexts so far: 16
  [550000/916874] blocks processed, unique contexts so far: 16
  [600000/916874] blocks processed, unique contexts so far: 16
  [650000/916874] blocks processed, unique contexts so far: 16
  [700000/916874] blocks processed, unique contexts so far: 16
  [750000/916874] blocks processed, unique contexts so far: 16
  [800000/916874] blocks processed, unique contexts so far: 16
  [850000/916874] blocks processed, unique contexts so far: 16
  [900000/916874] blocks processed, unique contexts so far: 16
   4          0.220709                16           91.0 MB
k=8...
  [50000/916874] blocks processed, unique contexts so far: 9
  [100000/916874] blocks processed, unique contexts so far: 172
  [150000/916874] blocks processed, unique contexts so far: 256
  [200000/916874] blocks processed, unique contexts so far: 256
  [250000/916874] blocks processed, unique contexts so far: 256
  [300000/916874] blocks processed, unique contexts so far: 256
  [350000/916874] blocks processed, unique contexts so far: 256
  [400000/916874] blocks processed, unique contexts so far: 256
  [450000/916874] blocks processed, unique contexts so far: 256
  [500000/916874] blocks processed, unique contexts so far: 256
  [550000/916874] blocks processed, unique contexts so far: 256
  [600000/916874] blocks processed, unique contexts so far: 256
  [650000/916874] blocks processed, unique contexts so far: 256
  [700000/916874] blocks processed, unique contexts so far: 256
  [750000/916874] blocks processed, unique contexts so far: 256
  [800000/916874] blocks processed, unique contexts so far: 256
  [850000/916874] blocks processed, unique contexts so far: 256
  [900000/916874] blocks processed, unique contexts so far: 256
   8          0.210546               256           86.8 MB
k=12...
  [50000/916874] blocks processed, unique contexts so far: 3
  [100000/916874] blocks processed, unique contexts so far: 764
  [150000/916874] blocks processed, unique contexts so far: 4.095
  [200000/916874] blocks processed, unique contexts so far: 4.096
  [250000/916874] blocks processed, unique contexts so far: 4.096
  [300000/916874] blocks processed, unique contexts so far: 4.096
  [350000/916874] blocks processed, unique contexts so far: 4.096
  [400000/916874] blocks processed, unique contexts so far: 4.096
  [450000/916874] blocks processed, unique contexts so far: 4.096
  [500000/916874] blocks processed, unique contexts so far: 4.096
  [550000/916874] blocks processed, unique contexts so far: 4.096
  [600000/916874] blocks processed, unique contexts so far: 4.096
  [650000/916874] blocks processed, unique contexts so far: 4.096
  [700000/916874] blocks processed, unique contexts so far: 4.096
  [750000/916874] blocks processed, unique contexts so far: 4.096
  [800000/916874] blocks processed, unique contexts so far: 4.096
  [850000/916874] blocks processed, unique contexts so far: 4.096
  [900000/916874] blocks processed, unique contexts so far: 4.096
  12          0.203732             4.096           84.0 MB
k=16...
  [50000/916874] blocks processed, unique contexts so far: 0
  [100000/916874] blocks processed, unique contexts so far: 1.115
  [150000/916874] blocks processed, unique contexts so far: 36.053
  [200000/916874] blocks processed, unique contexts so far: 48.458
  [250000/916874] blocks processed, unique contexts so far: 65.536
  [300000/916874] blocks processed, unique contexts so far: 65.536
  [350000/916874] blocks processed, unique contexts so far: 65.536
  [400000/916874] blocks processed, unique contexts so far: 65.536
  [450000/916874] blocks processed, unique contexts so far: 65.536
  [500000/916874] blocks processed, unique contexts so far: 65.536
  [550000/916874] blocks processed, unique contexts so far: 65.536
  [600000/916874] blocks processed, unique contexts so far: 65.536
  [650000/916874] blocks processed, unique contexts so far: 65.536
  [700000/916874] blocks processed, unique contexts so far: 65.536
  [750000/916874] blocks processed, unique contexts so far: 65.536
  [800000/916874] blocks processed, unique contexts so far: 65.536
  [850000/916874] blocks processed, unique contexts so far: 65.536
  [900000/916874] blocks processed, unique contexts so far: 65.536
  16          0.200539            65.536           82.7 MB
k=20...
  [50000/916874] blocks processed, unique contexts so far: 0
  [100000/916874] blocks processed, unique contexts so far: 1.164
  [150000/916874] blocks processed, unique contexts so far: 95.804
  [200000/916874] blocks processed, unique contexts so far: 167.890
  [250000/916874] blocks processed, unique contexts so far: 929.005
  [300000/916874] blocks processed, unique contexts so far: 972.314
  [350000/916874] blocks processed, unique contexts so far: 993.345
  [400000/916874] blocks processed, unique contexts so far: 1.000.178
  [450000/916874] blocks processed, unique contexts so far: 1.004.240
  [500000/916874] blocks processed, unique contexts so far: 1.007.600
  [550000/916874] blocks processed, unique contexts so far: 1.012.234
  [600000/916874] blocks processed, unique contexts so far: 1.015.838
  [650000/916874] blocks processed, unique contexts so far: 1.021.894
  [700000/916874] blocks processed, unique contexts so far: 1.028.972
  [750000/916874] blocks processed, unique contexts so far: 1.032.114
  [800000/916874] blocks processed, unique contexts so far: 1.047.936
  [850000/916874] blocks processed, unique contexts so far: 1.048.574
  [900000/916874] blocks processed, unique contexts so far: 1.048.576
  20          0.198153         1.048.576           81.7 MB
k=22...
  [50000/916874] blocks processed, unique contexts so far: 0
  [100000/916874] blocks processed, unique contexts so far: 1.142
  [150000/916874] blocks processed, unique contexts so far: 132.327
  [200000/916874] blocks processed, unique contexts so far: 249.770
  [250000/916874] blocks processed, unique contexts so far: 1.962.206
  [300000/916874] blocks processed, unique contexts so far: 2.285.611
  [350000/916874] blocks processed, unique contexts so far: 2.497.297
  [400000/916874] blocks processed, unique contexts so far: 2.576.927
  [450000/916874] blocks processed, unique contexts so far: 2.628.055
  [500000/916874] blocks processed, unique contexts so far: 2.671.766
  [550000/916874] blocks processed, unique contexts so far: 2.734.019
  [600000/916874] blocks processed, unique contexts so far: 2.783.228
  [650000/916874] blocks processed, unique contexts so far: 2.865.721
  [700000/916874] blocks processed, unique contexts so far: 2.980.027
  [750000/916874] blocks processed, unique contexts so far: 3.045.524
  [800000/916874] blocks processed, unique contexts so far: 3.754.843
  [850000/916874] blocks processed, unique contexts so far: 4.111.131
  [900000/916874] blocks processed, unique contexts so far: 4.156.791
  22          0.196707         4.171.148           80.7 MB

Keeping the current header structure intact, the best achievable size in this scenario would be 14.67 MB + 80.7 MB = 95.37 MB.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment