Last active
September 19, 2025 15:34
-
-
Save matchaxnb/61a249c80a2fd85754e17a7d935267f4 to your computer and use it in GitHub Desktop.
Text Algebra in python
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python | |
| # License: CC-0 | |
| """text_algebra: a stack-based RPN line-matching toolkit | |
| Usage: text_algebra [--repl|-f your-program.txt|your inline program] | |
| --repl starts a REPL to work interactively. | |
| -f your-program.txt will load a (possibly multi-line) program | |
| Definitions: | |
| Operators: U N D S X C F P R (see examples) | |
| Operands: anything else will be considered as filenames and read as | |
| unordered sets to the stack | |
| Output: push on the stack. | |
| End of program: at the end of a program, the lines contained in the last element of | |
| the stack are printed, if there are any. | |
| Comments: anything after a '#' symbol | |
| this_is_not_a_comment.txt #This is a comment | |
| Examples: | |
| phone-numbers-mary.txt phone-numbers-luke.txt U # U for Union | |
| This will output the union of all lines in phone-numbers-mary.txt and phone-numbers-luke.txt | |
| pn-mary.txt pn-joe.txt N # N for iNtersection | |
| This will output the lines in common in these two files | |
| pn-mary.txt pn-gustave.txt D # D for Difference | |
| This will output the lines that appear in pn-mary.txt but not in pn-gustave.txt | |
| pn-robert.txt pn-henry.txt S # S for substring | |
| This will output all lines in pn-robert.txt that are substrings of lines in pn-henry.txt | |
| pn-alice.txt pn-bob.txt X # X for XOR | |
| This will output all lines that are in exactly one of pn-alice.txt or pn-bob.txt | |
| pn-charlie.txt C # C for Count | |
| This will push pn-charlie.txt on the stack and display its number of lines (without pushing the count onto the stack) | |
| pn-danish.txt F 'john.*ghost.*' # F for Find | |
| This will output all lines in pn-danish.txt that match the regex /john.*ghost.*/ | |
| pn-danish.txt pn-echo.txt P # P for Peek | |
| This will push pn-danish.txt and then pn-echo.txt and print the entire stack without modifying it | |
| pn-danish.txt pn-echo.txt pn-foxtrot.txt U X P C R # R for reset | |
| This will | |
| - load the three files on the stack | |
| - union pn-echo.txt and pn-foxtrot.txt | |
| - run the symmetric difference of that union with pn-danish.txt | |
| - peek at the entire stack | |
| - count the number of lines in the top item of the stack | |
| - reset the stack afterwards | |
| REPL help: | |
| Use the following REPL commands to adjust its functions: | |
| quiet will toggle outputting the last item in the stack after each line is read | |
| exit will leave the REPL | |
| help will show this help | |
| Notes: | |
| F is a pseudo-operator that requires a complement (the regex) to it coming *after* it. | |
| The regex is not and cannot be read from the stack. It's a static element at the time of | |
| writing the program. Additionally because I'm a lazy thing the regex must be on the same | |
| line as F due to the way tokens are processed. | |
| """ | |
| import sys | |
| import re | |
| import os | |
| import readline | |
| import rlcompleter | |
| import atexit | |
| import glob | |
# Per-user file in which the REPL input history is persisted between sessions.
HISTORY_FILE = os.path.expanduser("~/.text_algebra_history")


def _save_history(path):
    """Write the readline history to *path*; history is best-effort, so I/O errors are ignored."""
    try:
        readline.write_history_file(path)
    except OSError:
        # An unwritable history file must not produce a traceback at exit.
        pass


def init_readline():
    """Load the REPL history from HISTORY_FILE and register saving it at exit.

    Persisted history is a convenience only: a missing, unreadable or
    otherwise unusable history file is ignored instead of aborting the REPL.
    """
    try:
        readline.read_history_file(HISTORY_FILE)
    except OSError:
        # Covers FileNotFoundError (first run) as well as permission
        # problems or a path that is not a regular file.
        pass
    atexit.register(_save_history, HISTORY_FILE)
def file_completer(text, state):
    """Readline completer that proposes filesystem paths starting with *text*.

    Readline calls this repeatedly with state = 0, 1, 2, ... and expects the
    state-th candidate each time, or None once the candidates are exhausted.

    The typed text is escaped before globbing so that names containing glob
    metacharacters ('[', '?', '*') are matched literally, and the candidates
    are sorted so that successive calls index into the same ordering.
    """
    matches = sorted(glob.glob(glob.escape(text) + "*"))
    return matches[state] if state < len(matches) else None
def show_result(stack):
    """Print the lines of the top stack element in sorted order, one per line.

    Nothing is printed when the stack is empty.
    """
    if not stack:
        return
    top_of_stack = stack[-1]
    for entry in sorted(top_of_stack):
        print(entry)
def start_repl(stack):
    """Run the interactive REPL, evaluating each input line against *stack*.

    REPL commands (matched after stripping surrounding whitespace):
      exit   leave the REPL
      help   print the module documentation
      quiet  toggle echoing the top of the stack after each line

    Any other line is handed to machine_process(). Ctrl-D / Ctrl-C leave the
    REPL; any other exception is reported and the loop continues.
    """
    init_readline()
    # Operands are file names: only whitespace should separate completion
    # words. The default delimiter set also contains '-' and '/', which would
    # hand the completer just the fragment after the last such character.
    readline.set_completer_delims(" \t\n")
    readline.set_completer(file_completer)
    readline.parse_and_bind("tab: complete")
    print("# text-algebra REPL")
    quiet = False
    while True:
        try:
            line = input(">>> ")
            command = line.strip()
            if command == "exit":
                break
            if command == "help":
                print(__doc__)
                continue
            if command == "quiet":
                quiet = not quiet
                continue
            machine_process(line, stack)
            # The stack may legitimately be empty (blank line, after 'R', or
            # after a failed load); there is nothing to echo in that case.
            if not quiet and stack:
                print(stack[-1])
        except (EOFError, KeyboardInterrupt):
            print("\nExiting.")
            break
        except Exception as e:
            # Top-level boundary of the interactive loop: report and keep going.
            print(f"\nGot an exception: {e}. Ignoring it and continuing as-is.")
            continue
def read_file_lines(filename):
    """Load the UTF-8 text file *filename* as a set of its lines.

    Each line is stripped of surrounding whitespace; lines that are empty
    after stripping are dropped, and duplicates collapse into one entry.
    """
    with open(filename, "r", encoding="utf-8") as handle:
        stripped = (raw.strip() for raw in handle)
        return {text for text in stripped if text}
def union(a, b):
    """Return the lines found in *a*, in *b*, or in both (operator U)."""
    combined = a | b
    return combined
def intersection(a, b):
    """Return the lines found in both *a* and *b* (operator N)."""
    common = a & b
    return common
def difference(a, b):
    """Return the lines of *a* that do not appear in *b* (operator D)."""
    only_in_a = a - b
    return only_in_a
def substring_match(a, b):
    """Return the lines of *a* that are a substring of at least one line of *b* (operator S)."""
    matched = set()
    for needle in a:
        for haystack in b:
            if needle in haystack:
                matched.add(needle)
                break  # one containing line is enough
    return matched
def symmetric_difference(a, b):
    """Return the lines found in exactly one of *a* and *b* (operator X)."""
    in_exactly_one = a ^ b
    return in_exactly_one
def count(a):
    """Print the number of elements in *a* and return *a* itself, unchanged (operator C)."""
    size = len(a)
    print(size)
    return a
def regex_filter(a, pattern):
    """Return the lines of *a* in which the regex *pattern* matches anywhere (operator F).

    Matching uses search semantics, so the pattern is not anchored unless it
    anchors itself. An invalid pattern raises re.error.
    """
    search = re.compile(pattern).search
    return {candidate for candidate in a if search(candidate)}
def _apply_binary_operator(token, a, b):
    """Apply the two-operand operator named by *token* (U, N, D, S or X) to *a* and *b*."""
    operations = {
        "U": union,
        "N": intersection,
        "D": difference,
        "S": substring_match,
        "X": symmetric_difference,
    }
    return operations[token](a, b)


def machine_process(line_of_tokens, stack):
    """Evaluate one line of an RPN program against *stack* (modified in place).

    Everything after the first '#' is a comment. The remaining text is split
    on whitespace and each token is processed left to right:

      U N D S X  pop two operands (b first, then a) and push the result
      C          print the size of the top element, leaving the stack as is
      F <regex>  replace the top element by its lines matching <regex>;
                 the regex is the next token on the same line
      P          print the whole stack
      R          clear the stack
      other      treated as a file name whose lines are pushed as a set

    On any error (missing operand, missing or invalid regex, unreadable file)
    a message is printed and processing of the line stops; tokens already
    evaluated keep their effect. Always returns None.
    """
    args = line_of_tokens.split('#', 1)[0].split()
    i = 0
    while i < len(args):
        token = args[i]
        if token in {"U", "N", "D", "S", "X"}:
            if len(stack) < 2:
                print(f"Error: Operator '{token}' requires two operands.")
                return
            b = stack.pop()
            a = stack.pop()
            stack.append(_apply_binary_operator(token, a, b))
        elif token == "C":
            if not stack:
                print("Error: Operator 'C' requires one operand.")
                return
            count(stack[-1])
        elif token == "F":
            if i + 1 >= len(args):
                print("Error: Operator 'F' requires a regex pattern.")
                return
            pattern = args[i + 1]
            if not stack:
                print("Error: Operator 'F' requires one operand.")
                return
            # Validate the pattern before touching the stack: a malformed
            # regex must neither crash the run nor drop the operand.
            try:
                re.compile(pattern)
            except re.error as e:
                print(f"Error: invalid regex '{pattern}': {e}")
                return
            stack.append(regex_filter(stack.pop(), pattern))
            i += 1  # Skip the pattern argument
        elif token == "P":
            print("peeking at stack of length", len(stack))
            print(stack)
        elif token == "R":
            print("resetting stack")
            stack.clear()
        else:
            try:
                lines = read_file_lines(token)
            except (OSError, ValueError) as e:
                # OSError: missing/unreadable file. ValueError: undecodable
                # content (UnicodeDecodeError) or an invalid file name.
                print(f"Error reading file '{token}': {e}")
                return
            stack.append(lines)
        i += 1
def execute_program(stack, program_name):
    """Run the program stored in the file *program_name* against *stack*.

    The file is read completely (as UTF-8) and closed first; each of its
    lines is then passed, in order, to machine_process().
    """
    with open(program_name, 'r', encoding='utf-8') as program_file:
        program_lines = list(program_file)
    for program_line in program_lines:
        machine_process(program_line, stack)
def main():
    """Command-line entry point.

    Dispatches on the first argument:
      --repl        start the interactive REPL
      -f FILE       execute the program stored in FILE
      anything else treat all arguments, joined by spaces, as an inline program

    Afterwards the lines of the top stack element, if any, are printed.
    With no arguments, or with -f but no file name, a message is written to
    stderr and the process exits with status 2 instead of failing with an
    IndexError.
    """
    args = sys.argv[1:]
    if not args:
        print(__doc__, file=sys.stderr)
        sys.exit(2)
    stack = []
    if args[0] == "--repl":
        start_repl(stack)
    elif args[0] == "-f":
        if len(args) < 2:
            print("Error: -f requires a program file name.", file=sys.stderr)
            sys.exit(2)
        execute_program(stack, args[1])
    else:
        machine_process(" ".join(args), stack)
    show_result(stack)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment