Created
March 9, 2026 16:13
-
-
Save exd02/c7bb8016e9533548a8e7307ab25c67cb to your computer and use it in GitHub Desktop.
Markov model analysis for hintfiles
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import sys | |
| from collections import defaultdict | |
| from pathlib import Path | |
| from math import log2 | |
# =========================
# Consts
# =========================
FILE_MAGIC = b"UTXO"  # 4-byte magic tag expected at the start of every hintfile
FILE_VERSION = 0  # only file-format version this parser accepts
FILE_HEADER_SIZE = 9  # 4 bytes magic + 1 byte version + 4 bytes stop_height
BLOCK_HEADER_SIZE = 12  # 4 bytes height + 8 bytes offset
| # ========================= | |
| # Util | |
| # ========================= | |
def fmt(n: int) -> str:
    """Format *n* with '.' as the thousands separator (e.g. 1234567 -> '1.234.567')."""
    grouped = format(n, ",")
    return grouped.replace(",", ".")
| # ========================= | |
| # Parser hintfile | |
| # ========================= | |
def read_hintfile(path: str):
    """
    Read a UTXO hintfile and parse its per-block index.

    File layout (all integers little-endian):
      - file header: 4-byte magic b"UTXO", 1-byte version, 4-byte stop_height
      - stop_height records of: 4-byte block height, 8-byte byte offset

    Args:
        path: filesystem path of the hintfile.

    Returns:
        tuple: (raw file bytes, dict mapping block height -> byte offset)

    Raises:
        ValueError: if the file is too small, has a bad magic/version,
            or the block index is truncated.
    """
    data = Path(path).read_bytes()
    if len(data) < FILE_HEADER_SIZE:
        raise ValueError("Invalid hint file: too small")
    magic = data[0:4]
    version = data[4]
    stop_height = int.from_bytes(data[5:9], "little")
    if magic != FILE_MAGIC or version != FILE_VERSION:
        raise ValueError("Invalid magic/version")
    # Fix: validate the index length up front. Slicing past the end of
    # `data` returns empty bytes, and int.from_bytes(b"", ...) == 0, so a
    # truncated file previously produced silent bogus {0: 0} entries
    # instead of an error.
    index_end = FILE_HEADER_SIZE + stop_height * BLOCK_HEADER_SIZE
    if len(data) < index_end:
        raise ValueError("Invalid hint file: truncated block index")
    offsets = {}
    cursor = FILE_HEADER_SIZE
    for _ in range(stop_height):
        height = int.from_bytes(data[cursor:cursor + 4], "little")
        offset = int.from_bytes(data[cursor + 4:cursor + 12], "little")
        offsets[height] = offset
        cursor += BLOCK_HEADER_SIZE
    return data, offsets
| # ========================= | |
| # Order-k entropy analysis | |
| # ========================= | |
def analyze_block_order_k(bit_bytes: bytes, bit_count: int, k: int, counts: dict):
    """
    Accumulate context counts for order-k entropy estimation.

    For every bit position i >= k, the k preceding bits form the context
    (packed into an int, MSB = oldest bit) and counts[context] tallies how
    often a 0 or 1 follows it. Blocks are independent sequences: contexts
    never carry over between blocks.

    Args:
        bit_bytes: raw bytes of the block bit array (MSB-first per byte)
        bit_count: number of valid bits in this block
        k: context order
        counts: dict[int -> [int, int]] mapping context -> [count_0, count_1]
    """
    if bit_count <= k:
        return
    # Flatten the whole buffer into a bit list once, then truncate to the
    # valid length — cheaper than re-extracting bits in the transition loop.
    unpacked = [(byte >> shift) & 1
                for byte in bit_bytes
                for shift in range(7, -1, -1)]
    bits = unpacked[:bit_count]
    # Seed the rolling context with the first k bits (oldest bit at the MSB).
    context = 0
    for bit in bits[:k]:
        context = (context << 1) | bit
    mask = (1 << k) - 1
    for next_bit in bits[k:]:
        slot = counts.setdefault(context, [0, 0])
        slot[next_bit] += 1
        # Shift the observed bit in and mask the oldest bit out.
        context = ((context << 1) | next_bit) & mask
def analyze_hintfile_order_k(path: str, k: int):
    """
    Compute the order-k conditional entropy over all hint bit arrays in a file.

    Each block record starts with a 4-byte little-endian hint count followed
    by the packed bit array. Blocks are analyzed as independent sequences.

    Args:
        path: path of the hintfile to analyze.
        k: context order (number of preceding bits conditioning each bit).

    Returns:
        tuple: (H_k in bits/symbol, total transitions counted, unique contexts)
    """
    data, offsets = read_hintfile(path)
    counts = {}  # context (int) -> [count_0, count_1]
    total_blocks = len(offsets)
    processed = 0
    for offset in offsets.values():
        num_hints = int.from_bytes(data[offset:offset + 4], "little")
        byte_count = (num_hints + 7) // 8
        bit_start = offset + 4
        block_bits = data[bit_start:bit_start + byte_count]
        analyze_block_order_k(block_bits, num_hints, k, counts)
        processed += 1
        if processed % 50_000 == 0:
            print(f" [{processed}/{total_blocks}] blocks processed, "
                  f"unique contexts so far: {fmt(len(counts))}", flush=True)
    # H_k = sum over contexts of P(ctx) * H(next | ctx), estimated by
    # weighting each context's binary entropy by its observation count.
    total = 0
    weighted_entropy = 0.0
    for n0, n1 in counts.values():
        n = n0 + n1
        total += n
        # A context always followed by the same symbol has zero entropy
        # and contributes nothing to the weighted sum.
        if n0 > 0 and n1 > 0:
            p0 = n0 / n
            p1 = n1 / n
            weighted_entropy += n * -(p0 * log2(p0) + p1 * log2(p1))
    if total == 0:
        return 0.0, 0, 0
    return weighted_entropy / total, total, len(counts)
| # ========================= | |
| # Main | |
| # ========================= | |
if __name__ == "__main__":
    # First CLI argument is the hintfile path; fall back to the default name.
    hintfile = "main.hints" if len(sys.argv) < 2 else sys.argv[1]
    # Context orders to sweep — raise the upper bound as memory/time allows.
    k_values = [1, 2, 4, 8, 12, 16, 20]
    print(f"Analyzing: {hintfile}")
    print()
    print(f"{'k':>4} {'H_k (bits/sym)':>16} {'unique contexts':>16} {'size estimate':>16}")
    print("-" * 60)
    total_bits = None
    for k in k_values:
        print(f"k={k}...", flush=True)
        h_k, total, unique_ctx = analyze_hintfile_order_k(hintfile, k)
        if total_bits is None:
            # Pin the transition count from the first (smallest) k; the
            # count only shrinks by k bits per block as k grows, so it is
            # approximately stable across the sweep.
            total_bits = total
        # Shannon bound on compressed size: H_k bits per symbol.
        size_mb = (h_k * total_bits) / 8 / 1024 / 1024
        print(f"{k:>4} {h_k:>16.6f} {fmt(unique_ctx):>16} {size_mb:>13.1f} MB")
    print()
Author
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
For each block we have: +4 bytes block height, +8 bytes block offset, +4 bytes hint count (not always exactly 4 — the hint count is a CompactSize varint) = 16 bytes * 916874 blocks = 14669984 bytes ≈ 14.67 megabytes.
Analysis Output
Considering the current structure (keeping the header layout intact), the best achievable size in that scenario would be 14.67 MB + 80.7 MB = 95.37 MB.