Created
March 9, 2026 16:13
-
-
Save exd02/c7bb8016e9533548a8e7307ab25c67cb to your computer and use it in GitHub Desktop.
Markov model analysis for hintfiles
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import sys | |
| from collections import defaultdict | |
| from pathlib import Path | |
| from math import log2 | |
# =========================
# Consts
# =========================
FILE_MAGIC = b"UTXO"  # 4-byte magic tag expected at the start of every hintfile
FILE_VERSION = 0  # only file-format version this parser accepts
FILE_HEADER_SIZE = 9  # 4 bytes magic + 1 byte version + 4 bytes stop_height
BLOCK_HEADER_SIZE = 12  # 4 bytes height + 8 bytes offset
| # ========================= | |
| # Util | |
| # ========================= | |
def fmt(n: int) -> str:
    """Format *n* with '.' as the thousands separator (e.g. 1234567 -> '1.234.567')."""
    grouped = format(n, ",")
    return grouped.replace(",", ".")
| # ========================= | |
| # Parser hintfile | |
| # ========================= | |
def read_hintfile(path: str):
    """
    Read a UTXO hintfile and parse its per-block index.

    File layout (all integers little-endian):
      - file header: 4-byte magic b"UTXO", 1-byte version, 4-byte stop_height
      - stop_height records of: 4-byte block height, 8-byte byte offset

    Args:
        path: filesystem path of the hintfile.

    Returns:
        tuple: (raw file bytes, dict mapping block height -> byte offset)

    Raises:
        ValueError: if the file is too small, has a bad magic/version,
            or the block index is truncated.
    """
    data = Path(path).read_bytes()
    if len(data) < FILE_HEADER_SIZE:
        raise ValueError("Invalid hint file: too small")
    magic = data[0:4]
    version = data[4]
    stop_height = int.from_bytes(data[5:9], "little")
    if magic != FILE_MAGIC or version != FILE_VERSION:
        raise ValueError("Invalid magic/version")
    # Fix: validate the index length up front. Slicing past the end of
    # `data` returns empty bytes, and int.from_bytes(b"", ...) == 0, so a
    # truncated file previously produced silent bogus {0: 0} entries
    # instead of an error.
    index_end = FILE_HEADER_SIZE + stop_height * BLOCK_HEADER_SIZE
    if len(data) < index_end:
        raise ValueError("Invalid hint file: truncated block index")
    offsets = {}
    cursor = FILE_HEADER_SIZE
    for _ in range(stop_height):
        height = int.from_bytes(data[cursor:cursor + 4], "little")
        offset = int.from_bytes(data[cursor + 4:cursor + 12], "little")
        offsets[height] = offset
        cursor += BLOCK_HEADER_SIZE
    return data, offsets
| # ========================= | |
| # Order-k entropy analysis | |
| # ========================= | |
def analyze_block_order_k(bit_bytes: bytes, bit_count: int, k: int, counts: dict):
    """
    Accumulate context counts for order-k entropy estimation.

    For every bit position i >= k, the k preceding bits form the context
    (packed into an int, MSB = oldest bit) and counts[context] tallies how
    often a 0 or 1 follows it. Blocks are independent sequences: contexts
    never carry over between blocks.

    Args:
        bit_bytes: raw bytes of the block bit array (MSB-first per byte)
        bit_count: number of valid bits in this block
        k: context order
        counts: dict[int -> [int, int]] mapping context -> [count_0, count_1]
    """
    if bit_count <= k:
        return
    # Flatten the whole buffer into a bit list once, then truncate to the
    # valid length — cheaper than re-extracting bits in the transition loop.
    unpacked = [(byte >> shift) & 1
                for byte in bit_bytes
                for shift in range(7, -1, -1)]
    bits = unpacked[:bit_count]
    # Seed the rolling context with the first k bits (oldest bit at the MSB).
    context = 0
    for bit in bits[:k]:
        context = (context << 1) | bit
    mask = (1 << k) - 1
    for next_bit in bits[k:]:
        slot = counts.setdefault(context, [0, 0])
        slot[next_bit] += 1
        # Shift the observed bit in and mask the oldest bit out.
        context = ((context << 1) | next_bit) & mask
def analyze_hintfile_order_k(path: str, k: int):
    """
    Compute the order-k conditional entropy over all hint bit arrays in a file.

    Each block record starts with a 4-byte little-endian hint count followed
    by the packed bit array. Blocks are analyzed as independent sequences.

    Args:
        path: path of the hintfile to analyze.
        k: context order (number of preceding bits conditioning each bit).

    Returns:
        tuple: (H_k in bits/symbol, total transitions counted, unique contexts)
    """
    data, offsets = read_hintfile(path)
    counts = {}  # context (int) -> [count_0, count_1]
    total_blocks = len(offsets)
    processed = 0
    for offset in offsets.values():
        num_hints = int.from_bytes(data[offset:offset + 4], "little")
        byte_count = (num_hints + 7) // 8
        bit_start = offset + 4
        block_bits = data[bit_start:bit_start + byte_count]
        analyze_block_order_k(block_bits, num_hints, k, counts)
        processed += 1
        if processed % 50_000 == 0:
            print(f" [{processed}/{total_blocks}] blocks processed, "
                  f"unique contexts so far: {fmt(len(counts))}", flush=True)
    # H_k = sum over contexts of P(ctx) * H(next | ctx), estimated by
    # weighting each context's binary entropy by its observation count.
    total = 0
    weighted_entropy = 0.0
    for n0, n1 in counts.values():
        n = n0 + n1
        total += n
        # A context always followed by the same symbol has zero entropy
        # and contributes nothing to the weighted sum.
        if n0 > 0 and n1 > 0:
            p0 = n0 / n
            p1 = n1 / n
            weighted_entropy += n * -(p0 * log2(p0) + p1 * log2(p1))
    if total == 0:
        return 0.0, 0, 0
    return weighted_entropy / total, total, len(counts)
| # ========================= | |
| # Main | |
| # ========================= | |
if __name__ == "__main__":
    # First CLI argument is the hintfile path; fall back to the default name.
    hintfile = "main.hints" if len(sys.argv) < 2 else sys.argv[1]
    # Context orders to sweep — raise the upper bound as memory/time allows.
    k_values = [1, 2, 4, 8, 12, 16, 20]
    print(f"Analyzing: {hintfile}")
    print()
    print(f"{'k':>4} {'H_k (bits/sym)':>16} {'unique contexts':>16} {'size estimate':>16}")
    print("-" * 60)
    total_bits = None
    for k in k_values:
        print(f"k={k}...", flush=True)
        h_k, total, unique_ctx = analyze_hintfile_order_k(hintfile, k)
        if total_bits is None:
            # Pin the transition count from the first (smallest) k; the
            # count only shrinks by k bits per block as k grows, so it is
            # approximately stable across the sweep.
            total_bits = total
        # Shannon bound on compressed size: H_k bits per symbol.
        size_mb = (h_k * total_bits) / 8 / 1024 / 1024
        print(f"{k:>4} {h_k:>16.6f} {fmt(unique_ctx):>16} {size_mb:>13.1f} MB")
    print()
Author
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
For each block we have: +4 bytes block height, +8 bytes block offset, +4 bytes hint count (not always exactly 4 — the hint count is a CompactSize varint) = 16 bytes * 916874 blocks = 14669984 bytes ≈ 14.67 megabytes.
Analysis Output
Considering the current structure (keeping the header layout intact), the best achievable size in that scenario would be 14.67 MB + 80.7 MB = 95.37 MB.