Skip to content

Instantly share code, notes, and snippets.

@matchaxnb
Last active September 19, 2025 15:34
Show Gist options
  • Select an option

  • Save matchaxnb/61a249c80a2fd85754e17a7d935267f4 to your computer and use it in GitHub Desktop.

Select an option

Save matchaxnb/61a249c80a2fd85754e17a7d935267f4 to your computer and use it in GitHub Desktop.
Text Algebra in python
#!/usr/bin/env python
# License: CC-0
"""text_algebra: a stack-based RPN line-matching toolkit
Usage: text_algebra [--repl|-f your-program.txt|your inline program]
--repl starts a REPL to work interactively.
-f your-program.txt will load a (possibly multi-line) program
Definitions:
Operators: U N D S X C F P R (see examples)
Operands: anything else will be considered as filenames and read as
unordered sets to the stack
Output: push on the stack.
End of program: at the end of a program, the lines contained in the last element of
the stack are printed, if there are any.
Comments: anything after a '#' symbol
this_is_not_a_comment.txt #This is a comment
Examples:
phone-numbers-mary.txt phone-numbers-luke.txt U # U for Union
This will output the union of all lines in phone-numbers-mary.txt and phone-numbers-luke.txt
pn-mary.txt pn-joe.txt N # N for iNtersection
This will output the lines in common in these two files
pn-mary.txt pn-gustave.txt D # D for Difference
This will output the lines that appear in pn-mary.txt but not in pn-gustave.txt
pn-robert.txt pn-henry.txt S # S for substring
This will output all lines in pn-robert.txt that are substrings of lines in pn-henry.txt
pn-alice.txt pn-bob.txt X # X for XOR
This will output all lines that are in exactly one of pn-alice.txt or pn-bob.txt
pn-charlie.txt C # C for Count
This will push pn-charlie.txt on the stack and display its length (without pushing the count onto the stack)
pn-danish.txt F 'john.*ghost.*' # F for Find
This will output all lines in pn-danish.txt that match the regex /john.*ghost.*/
pn-danish.txt pn-echo.txt P # P for Peek
This will push pn-danish.txt and then pn-echo.txt and print the entire stack without modifying it
pn-danish.txt pn-echo.txt pn-foxtrot.txt U X P C R # R for reset
This will
- load the three files on the stack
- union pn-echo.txt and pn-foxtrot.txt
- run the symmetric difference of that union with pn-danish.txt
- peek at the entire stack
- count the number of lines in the top element of the stack (the stack itself should hold 1 item)
- reset the stack afterwards
REPL help:
Use the following REPL commands to adjust its functions:
quiet will toggle outputting the last item in the stack after each line is read
exit will leave the REPL
help will show this help
Notes:
F is a pseudo-operator that requires a complement (the regex) to it coming *after* it.
The regex is not and cannot be read from the stack. It's a static element at the time of
writing the program. Additionally because I'm a lazy thing the regex must be on the same
line as F due to the way tokens are processed.
"""
import sys
import re
import os
import readline
import rlcompleter
import atexit
import glob
# Path of the readline history file used by the REPL ("~" is expanded to the
# current user's home directory).
HISTORY_FILE = os.path.expanduser("~/.text_algebra_history")
def init_readline():
    """Load the saved readline history and arrange for it to be saved at exit.

    A missing history file is not an error (nothing has been saved yet); any
    other failure while reading it propagates to the caller.
    """
    history_path = HISTORY_FILE
    try:
        readline.read_history_file(history_path)
    except FileNotFoundError:
        # No history has been written yet; start with an empty one.
        pass
    # Persist whatever history exists when the interpreter shuts down.
    atexit.register(readline.write_history_file, history_path)
def file_completer(text, state):
    """Readline completer that proposes filesystem paths starting with *text*.

    Args:
        text: The prefix typed so far.
        state: Index of the candidate requested by readline.

    Returns:
        The candidate path at position *state*, or None once *state* is past
        the last candidate.
    """
    candidates = glob.glob(text + "*")
    if state >= len(candidates):
        return None
    return candidates[state]
def show_result(stack):
    """Print the lines of the top stack element, one per line, in sorted order.

    Nothing is printed when the stack is empty or its top element is empty.
    """
    if not stack:
        return
    ordered = sorted(stack[-1])
    for entry in ordered:
        print(entry)
def start_repl(stack):
    """Run the interactive read-eval-print loop on *stack*.

    Each input line is either a REPL command ("exit", "help", "quiet") or a
    line of program tokens handed to machine_process. Unless quiet mode is
    on, the top of the stack is printed after every processed line.

    Args:
        stack: The list used as the evaluation stack; it is mutated in place.

    The loop ends on "exit", end-of-file or Ctrl-C. Any other exception
    raised while processing a line is reported and the loop continues.
    """
    init_readline()
    # Python's readline treats characters such as '-' and '/' as word breaks
    # by default, so only the tail of names like "pn-mary.txt" or "dir/file"
    # would reach the completer. Break words on whitespace only.
    readline.set_completer_delims(" \t\n")
    readline.set_completer(file_completer)
    readline.parse_and_bind("tab: complete")
    print("# text-algebra REPL")
    quiet = False
    while True:
        try:
            line = input(">>> ")
            command = line.strip()
            if command == "exit":
                break
            if command == "help":
                print(__doc__)
                continue
            if command == "quiet":
                quiet = not quiet
                continue
            machine_process(line, stack)
            # The stack may be empty (blank line, 'R', or a failed first
            # command); indexing it then would raise IndexError.
            if not quiet and stack:
                print(stack[-1])
        except (EOFError, KeyboardInterrupt):
            print("\nExiting.")
            break
        except Exception as e:
            # Top-level boundary of the REPL: report and keep the session alive.
            print(f"\nGot an exception: {e}. Ignoring it and continuing as-is.")
            continue
def read_file_lines(filename):
    """Return the distinct non-blank lines of a UTF-8 text file as a set.

    Every line is stripped of leading and trailing whitespace; lines that are
    empty after stripping are dropped.
    """
    with open(filename, "r", encoding="utf-8") as handle:
        raw_lines = handle.readlines()
    stripped = (raw.strip() for raw in raw_lines)
    return {text for text in stripped if text}
def union(a, b):
    """Return the elements found in *a*, in *b*, or in both."""
    merged = a | b
    return merged
def intersection(a, b):
    """Return the elements found in both *a* and *b*."""
    shared = a & b
    return shared
def difference(a, b):
    """Return the elements of *a* that are not in *b*."""
    remaining = a - b
    return remaining
def substring_match(a, b):
    """Return the lines of *a* that occur inside at least one line of *b*."""
    matched = set()
    for needle in a:
        for haystack in b:
            if needle in haystack:
                matched.add(needle)
                # One containing line is enough; move on to the next needle.
                break
    return matched
def symmetric_difference(a, b):
    """Return the elements found in exactly one of *a* and *b*."""
    exclusive = a ^ b
    return exclusive
def count(a):
    """Print the number of elements in *a* and return *a* itself."""
    size = len(a)
    print(size)
    return a
def regex_filter(a, pattern):
    """Return the lines of *a* in which the regex *pattern* matches anywhere.

    Raises:
        re.error: If *pattern* is not a valid regular expression.
    """
    search = re.compile(pattern).search
    kept = set()
    for candidate in a:
        if search(candidate):
            kept.add(candidate)
    return kept
def machine_process(line_of_tokens, stack):
    """Execute one line of an RPN text-algebra program against *stack*.

    Everything after the first '#' is discarded as a comment; the rest is
    split on whitespace into tokens that are processed left to right:

    * U, N, D, S, X pop two sets (b first, then a) and push the result of
      union, intersection, difference, substring_match or
      symmetric_difference applied to (a, b).
    * C prints the size of the top set and leaves the stack untouched.
    * F consumes the following token as a regex and replaces the top set
      with the lines matching it. One matching pair of surrounding quotes
      is stripped from the regex, because no shell removes them when the
      line comes from the REPL or a program file.
    * P prints the whole stack; R empties it.
    * Any other token is treated as a filename whose lines are pushed.

    Args:
        line_of_tokens: The program line to execute.
        stack: The list of sets used as the evaluation stack; mutated in place.

    Returns:
        None. On an error (too few operands, missing or invalid regex,
        unreadable file) a message is printed and the rest of the line is
        skipped; the failing F or file token leaves the stack as it was.
    """
    tokens = iter(line_of_tokens.split('#', 1)[0].split())
    for token in tokens:
        if token in {"U", "N", "D", "S", "X"}:
            if len(stack) < 2:
                print(f"Error: Operator '{token}' requires two operands.")
                return
            b = stack.pop()
            a = stack.pop()
            if token == "U":
                stack.append(union(a, b))
            elif token == "N":
                stack.append(intersection(a, b))
            elif token == "D":
                stack.append(difference(a, b))
            elif token == "S":
                stack.append(substring_match(a, b))
            else:  # "X"
                stack.append(symmetric_difference(a, b))
        elif token == "C":
            if not stack:
                print("Error: Operator 'C' requires one operand.")
                return
            count(stack[-1])
        elif token == "F":
            # The regex is the token right after F; pulling it from the
            # iterator also skips it in the main loop.
            pattern = next(tokens, None)
            if pattern is None:
                print("Error: Operator 'F' requires a regex pattern.")
                return
            if not stack:
                print("Error: Operator 'F' requires one operand.")
                return
            if len(pattern) >= 2 and pattern[0] == pattern[-1] and pattern[0] in "'\"":
                pattern = pattern[1:-1]
            # Validate before touching the stack so a bad pattern cannot
            # destroy the operand.
            try:
                re.compile(pattern)
            except re.error as e:
                print(f"Error: invalid regex '{pattern}': {e}")
                return
            stack[-1] = regex_filter(stack[-1], pattern)
        elif token == "P":
            print("peeking at stack of length", len(stack))
            print(stack)
        elif token == "R":
            print("resetting stack")
            stack.clear()
        else:
            try:
                lines = read_file_lines(token)
            except Exception as e:
                print(f"Error reading file '{token}': {e}")
                return
            stack.append(lines)
def execute_program(stack, program_name):
    """Run the program stored in the UTF-8 file *program_name* against *stack*.

    The file is read completely first; its lines are then passed to
    machine_process one at a time, in file order.
    """
    with open(program_name, 'r', encoding='utf-8') as source:
        program_lines = list(source)
    for statement in program_lines:
        machine_process(statement, stack)
def main():
    """Command-line entry point.

    Reads sys.argv and dispatches on the first argument:

    * "--repl" starts the interactive REPL.
    * "-f FILE" executes the program stored in FILE.
    * anything else: all arguments are joined with spaces and executed as
      one inline program line.

    Afterwards the lines of the top stack element, if any, are printed.

    Exits with status 2 after printing the module help (no arguments) or an
    error message ("-f" without a file name) to stderr.
    """
    args = sys.argv[1:]
    if not args:
        # Without this guard args[0] below would raise IndexError.
        print(__doc__, file=sys.stderr)
        sys.exit(2)
    stack = []
    mode = args[0]
    if mode == "--repl":
        start_repl(stack)
    elif mode == "-f":
        if len(args) < 2:
            print("Error: '-f' requires a program file name.", file=sys.stderr)
            sys.exit(2)
        execute_program(stack, args[1])
    else:
        machine_process(" ".join(args), stack)
    show_result(stack)
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment