Created
January 23, 2026 20:28
-
-
Save kanzure/8527aaabe81a88f20f1c0f511597e63d to your computer and use it in GitHub Desktop.
docreader: a tool for LLMs for reading and navigating markdown documents
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env -S uv run --script --with=click | |
| """ | |
| docreader - A tool for reading and navigating Markdown documents. | |
| This script provides efficient access to specific sections within Markdown files, | |
| making it particularly useful when working with large documentation. | |
| Features: | |
| - Parse and display a table of contents for any Markdown file | |
| - Extract specific sections by name or slug (case-insensitive) | |
| - Handle multiple section extractions in a single command | |
| - Auto-generate TOC when one doesn't exist | |
| Token Efficiency for LLMs: | |
| When working with Large Language Models, context window limits are a critical | |
| constraint. This tool helps reduce token consumption by: | |
| 1. Selective Extraction: Retrieve only the specific sections you need instead | |
| of feeding entire documentation files into the context. | |
| 2. Quick Navigation: Preview the TOC to identify relevant sections before | |
| requesting their full content. | |
| 3. Batch Processing: Extract multiple related sections in a single call, | |
| reducing the number of separate requests needed. | |
| Example usage for LLM workflows: | |
| # First, see what's available | |
| ./docreader README.md | |
| # Then, extract only what you need | |
| ./docreader README.md "API Reference" "Authentication" | |
| :author: Bryan Bishop <[email protected]> | |
| :date: 2026-01-23 | |
| """ | |
| import re | |
| import sys | |
| import unittest | |
| from pathlib import Path | |
| import click | |
def parse_markdown_headers(content: str) -> list[tuple[int, str, str]]:
    """Parse ATX-style markdown headers from content.

    Lines inside fenced code blocks (delimited by three or more backticks
    or tildes) are skipped, so a shell comment such as ``# install deps``
    inside an example is not reported as a header.

    Args:
        content: Full markdown text.

    Returns:
        A list of ``(level, title, anchor_id)`` tuples in document order.
        ``level`` is the number of leading ``#`` characters (1-6) and
        ``anchor_id`` is the lowercased title with punctuation removed and
        spaces replaced by hyphens.
    """
    header_re = re.compile(r'^(#{1,6})\s+(.+)$')
    fence_re = re.compile(r'^ {0,3}(`{3,}|~{3,})')

    headers = []
    open_fence = None  # marker that opened the current code fence, if any
    for line in content.split('\n'):
        fence = fence_re.match(line)
        if fence:
            marker = fence.group(1)
            rest = line[fence.end():]
            if open_fence is None:
                # A backtick fence may not carry backticks in its info
                # string; such a line is inline code, not a fence.
                if not (marker[0] == '`' and '`' in rest):
                    open_fence = marker
                    continue
            elif (
                marker[0] == open_fence[0]
                and len(marker) >= len(open_fence)
                and not rest.strip()
            ):
                # Closing fence: same character, at least as long, bare.
                open_fence = None
                continue
        if open_fence is not None:
            continue

        match = header_re.match(line)
        if match:
            level = len(match.group(1))
            title = match.group(2).strip()
            # Create anchor ID by lowercasing and replacing spaces with hyphens
            anchor = re.sub(r'[^\w\s-]', '', title.lower()).strip().replace(' ', '-')
            headers.append((level, title, anchor))
    return headers
| def find_existing_toc(content: str) -> tuple[int | None, int | None]: | |
| """Find existing table of contents section. | |
| Returns (toc_start, toc_end) line indices or (None, None) if not found. | |
| """ | |
| lines = content.split('\n') | |
| toc_start = None | |
| toc_end = None | |
| for i, line in enumerate(lines): | |
| # Look for a TOC header | |
| if re.match(r'^#+\s*Table\s+of\s+Contents', line, re.IGNORECASE): | |
| toc_start = i | |
| # Find where TOC ends (next header or end of file) | |
| for j in range(i + 1, len(lines)): | |
| if lines[j].startswith('#') and lines[j].strip(): | |
| toc_end = j | |
| break | |
| if toc_end is None: | |
| toc_end = len(lines) | |
| break | |
| return (toc_start, toc_end) | |
def generate_toc(headers: list[tuple[int, str, str]]) -> str:
    """Generate a markdown table of contents from parsed headers.

    Each header becomes a ``- [title](#anchor)`` list item. Items are
    indented two spaces per level relative to the shallowest header
    present: two spaces is the minimum that nests a ``- `` list item in
    CommonMark, and anchoring to the shallowest level keeps the first
    items flush left so they are never rendered as an indented code block.

    Args:
        headers: ``(level, title, anchor_id)`` tuples in document order.

    Returns:
        The TOC as a single string, starting with a
        ``## Table of Contents`` heading followed by a blank line.
    """
    toc_lines = ["## Table of Contents\n"]
    base_level = min((level for level, _, _ in headers), default=1)
    for level, title, anchor in headers:
        indent = "  " * (level - base_level)
        toc_lines.append(f"{indent}- [{title}](#{anchor})")
    return "\n".join(toc_lines)
def find_matching_sections(content: str, section_name: str) -> list[tuple[int, int, str]]:
    """Find all sections matching the given name (by title or slug).

    A header matches when its title equals ``section_name``
    case-insensitively, or when both produce the same slug (lowercased,
    punctuation removed, spaces replaced by hyphens). A section runs from
    its header line up to the next header of the same or a higher level,
    or to the end of the document.

    Only ATX headers with 1-6 ``#`` characters count, and lines inside
    fenced code blocks are ignored, so a ``# comment`` in a code example
    neither matches nor truncates a section.

    Args:
        content: Full markdown text.
        section_name: Title or slug to look for.

    Returns:
        A list of ``(start_line, end_line, title)`` tuples using 0-based
        line indices, with ``end_line`` exclusive.
    """
    lines = content.split('\n')
    header_re = re.compile(r'^(#{1,6})\s+(.*)$')
    fence_re = re.compile(r'^ {0,3}(`{3,}|~{3,})')

    def slugify(text: str) -> str:
        return re.sub(r'[^\w\s-]', '', text.lower()).strip().replace(' ', '-')

    # Pass 1: collect every header outside code fences as (index, level, title).
    headers = []
    open_fence = None  # marker that opened the current code fence, if any
    for index, line in enumerate(lines):
        fence = fence_re.match(line)
        if fence:
            marker = fence.group(1)
            rest = line[fence.end():]
            if open_fence is None:
                # A backtick fence may not carry backticks in its info
                # string; such a line is inline code, not a fence.
                if not (marker[0] == '`' and '`' in rest):
                    open_fence = marker
                    continue
            elif (
                marker[0] == open_fence[0]
                and len(marker) >= len(open_fence)
                and not rest.strip()
            ):
                open_fence = None
                continue
        if open_fence is not None:
            continue
        found = header_re.match(line)
        if found:
            headers.append((index, len(found.group(1)), found.group(2).strip()))

    # Pass 2: pick matching headers and bound each by the next peer/parent.
    wanted_title = section_name.lower()
    wanted_slug = slugify(section_name)
    matches = []
    for position, (start, level, title) in enumerate(headers):
        if not title:
            # Empty headers still end sections but are never a match target.
            continue
        if title.lower() != wanted_title and slugify(title) != wanted_slug:
            continue
        end = next(
            (index for index, other_level, _ in headers[position + 1:] if other_level <= level),
            len(lines),
        )
        matches.append((start, end, title))
    return matches
def extract_section(content: str, section_name: str) -> str:
    """Return the text of one section, header line included.

    The section is looked up by exact title (case-insensitive) or by slug.
    When nothing matches, a "not found" message is returned in place of
    the section text. When several sections match, the candidates are
    listed on stderr and the process exits with status 1.
    """
    found = find_matching_sections(content, section_name)

    if len(found) == 0:
        return f"Section '{section_name}' not found."

    if len(found) == 1:
        first_line, stop_line, _ = found[0]
        return '\n'.join(content.split('\n')[first_line:stop_line])

    # Ambiguous request: report every candidate, then bail out.
    click.echo(f"Error: '{section_name}' is ambiguous. Matches multiple sections:", err=True)
    for first_line, stop_line, heading in found:
        # Indices are 0-based with an exclusive end; show 1-based inclusive.
        click.echo(f" - {heading} (lines {first_line + 1}-{stop_line})", err=True)
    sys.exit(1)
def get_opening_section(content: str) -> str:
    """Return the document's opening part, or a generated TOC.

    If the document already has a table of contents, everything from the
    top of the file through the end of that TOC is returned. Otherwise a
    TOC is generated from the document's headers and returned on its own.
    """
    toc_start, toc_end = find_existing_toc(content)

    if toc_start is None:
        # No TOC in the document: build one from its headers.
        return generate_toc(parse_markdown_headers(content))

    # Keep everything up to (not including) the line that ends the TOC.
    return '\n'.join(content.split('\n')[:toc_end])
class DefaultGroup(click.Group):
    """A Click group that falls back to a default command.

    When the first command-line argument is not the name of a registered
    subcommand, the configured default command is inserted in front of the
    arguments, so ``prog FILE`` behaves like ``prog <default_cmd> FILE``.
    """

    def __init__(self, *args, default_cmd: str | None = None, **kwargs):
        """Create the group.

        Args:
            default_cmd: Name of the subcommand to fall back to, or
                ``None`` to disable the fallback.
            *args, **kwargs: Passed through to ``click.Group``.
        """
        super().__init__(*args, **kwargs)
        self.default_cmd = default_cmd

    def parse_args(self, ctx, args):
        """Prepend the default command when no known subcommand is named."""
        if self.default_cmd and args:
            first = args[0]
            # Leave option-like tokens (e.g. --help) to the group itself so
            # the group help, which lists every subcommand, stays reachable.
            # A bare '-' or the '--' separator is still routed to the
            # default command, as before.
            is_group_option = first.startswith('-') and first not in ('-', '--')
            if first not in self.commands and not is_group_option:
                args = [self.default_cmd, *args]
        return super().parse_args(ctx, args)
@click.group(cls=DefaultGroup, default_cmd='read', invoke_without_command=True)
@click.pass_context
def main(ctx: click.Context) -> None:
    """DocReader - A tool for reading and navigating Markdown documents.
    Use 'docreader FILENAME [SECTIONS...]' to read documents,
    or 'docreader test' to run the test suite.
    """
    # The docstring above is the help text Click prints; keep its wording.
    if ctx.invoked_subcommand is not None:
        # A subcommand was selected; Click will run it after this callback.
        return
    # Bare invocation: print the group help instead of doing nothing.
    click.echo(ctx.get_help())
@main.command('read')
@click.argument('filename', type=click.Path(exists=True, dir_okay=False))
@click.argument('sections', nargs=-1, required=False)
def read_command(filename: str, sections: tuple[str, ...]) -> None:
    """Read and navigate Markdown documents.

    FILENAME: Path to the markdown file to read.

    SECTIONS: Optional section names to extract. If not provided, displays the table of contents.
    """
    # The docstring above is rendered by Click as the command help; the
    # blank lines keep FILENAME and SECTIONS as separate paragraphs.
    # dir_okay=False makes Click reject a directory with a usage error
    # instead of letting the read below fail with IsADirectoryError.
    content = Path(filename).read_text(encoding='utf-8')

    if not sections:
        # No sections requested: show the TOC (existing or generated)...
        click.echo(get_opening_section(content))
        # ...followed by a short hint on how to request a section.
        click.echo("\n" + "-" * 40)
        click.echo("\nTo read a specific section, use the section name or its slug:")
        click.echo(f" docreader {filename} \"Section Name Goes Here\"")
        click.echo(f" docreader {filename} <section-slug>")
        click.echo("\nExample:")
        click.echo(f" docreader {filename} my-section-slug-here")
        return

    # Print each requested section in the order given. When several were
    # requested, a separator rule follows every section.
    for section in sections:
        click.echo(extract_section(content, section))
        if len(sections) > 1:
            click.echo("\n" + "-" * 80 + "\n")
@main.command('test')
@click.option('-v', '--verbose', is_flag=True, help='Verbose test output')
def test_command(verbose: bool) -> None:
    """Run the test suite."""
    # Collect every test method defined on the embedded TestDocReader case.
    suite = unittest.TestLoader().loadTestsFromTestCase(TestDocReader)

    # unittest verbosity: 1 prints dots, 2 prints one line per test.
    if verbose:
        level = 2
    else:
        level = 1
    outcome = unittest.TextTestRunner(verbosity=level).run(suite)

    # Mirror the test result in the process exit status.
    exit_code = 0 if outcome.wasSuccessful() else 1
    sys.exit(exit_code)
class TestDocReader(unittest.TestCase):
    """Unit tests for DocReader functions.

    Fixtures are written as explicit string literals, one per markdown
    line, so their content does not depend on source indentation and the
    line indices asserted below can be checked by counting.
    """

    def test_parse_markdown_headers(self):
        """Test parsing markdown headers."""
        content = (
            "# Title 1\n"
            "Some content\n"
            "## Title 2\n"
            "More content\n"
            "### Title 3\n"
            "Even more content\n"
        )
        headers = parse_markdown_headers(content)
        self.assertEqual(len(headers), 3)
        self.assertEqual(headers[0], (1, "Title 1", "title-1"))
        self.assertEqual(headers[1], (2, "Title 2", "title-2"))
        self.assertEqual(headers[2], (3, "Title 3", "title-3"))

    def test_parse_markdown_headers_with_special_chars(self):
        """Test parsing headers with special characters."""
        content = (
            "# API Conventions\n"
            "## Table of Contents\n"
            "### Working with LLMs\n"
        )
        headers = parse_markdown_headers(content)
        self.assertEqual(len(headers), 3)
        self.assertEqual(headers[0], (1, "API Conventions", "api-conventions"))
        self.assertEqual(headers[1], (2, "Table of Contents", "table-of-contents"))
        self.assertEqual(headers[2], (3, "Working with LLMs", "working-with-llms"))

    def test_find_existing_toc(self):
        """Test finding existing table of contents."""
        # Blank lines matter here: the TOC header is line index 2 and the
        # next header is line index 7.
        content = (
            "# Title\n"
            "\n"
            "## Table of Contents\n"
            "\n"
            "- [Section 1](#section-1)\n"
            "- [Section 2](#section-2)\n"
            "\n"
            "## Section 1\n"
            "\n"
            "Content here\n"
        )
        toc_start, toc_end = find_existing_toc(content)
        self.assertEqual(toc_start, 2)
        self.assertEqual(toc_end, 7)

    def test_find_existing_toc_not_found(self):
        """Test when TOC doesn't exist."""
        content = (
            "# Title\n"
            "## Section 1\n"
            "Content here\n"
        )
        toc_start, toc_end = find_existing_toc(content)
        self.assertIsNone(toc_start)
        self.assertIsNone(toc_end)

    def test_generate_toc(self):
        """Test generating table of contents."""
        headers = [
            (1, "Title 1", "title-1"),
            (2, "Title 2", "title-2"),
            (3, "Title 3", "title-3"),
        ]
        toc = generate_toc(headers)
        self.assertIn("## Table of Contents", toc)
        self.assertIn("- [Title 1](#title-1)", toc)
        # Nested entries must carry at least one leading space of indent.
        self.assertIn(" - [Title 2](#title-2)", toc)
        self.assertIn(" - [Title 3](#title-3)", toc)

    def test_extract_section(self):
        """Test extracting a specific section."""
        content = (
            "# Title\n"
            "## Section 1\n"
            "Content of section 1\n"
            "## Section 2\n"
            "Content of section 2\n"
        )
        section = extract_section(content, "Section 1")
        self.assertIn("## Section 1", section)
        self.assertIn("Content of section 1", section)
        self.assertNotIn("Section 2", section)

    def test_extract_section_not_found(self):
        """Test extracting a non-existent section."""
        content = (
            "# Title\n"
            "## Section 1\n"
            "Content\n"
        )
        section = extract_section(content, "NonExistent")
        self.assertEqual(section, "Section 'NonExistent' not found.")

    def test_extract_section_case_insensitive(self):
        """Test that section extraction is case-insensitive."""
        content = (
            "# Title\n"
            "## My Section\n"
            "Content\n"
        )
        section = extract_section(content, "my section")
        self.assertIn("## My Section", section)

    def test_get_opening_section_with_toc(self):
        """Test getting opening section when TOC exists."""
        content = (
            "# Title\n"
            "Intro text\n"
            "## Table of Contents\n"
            "- [Section 1](#section-1)\n"
            "## Section 1\n"
            "Content\n"
        )
        opening = get_opening_section(content)
        self.assertIn("# Title", opening)
        self.assertIn("## Table of Contents", opening)
        self.assertNotIn("## Section 1", opening)

    def test_get_opening_section_without_toc(self):
        """Test getting opening section when TOC doesn't exist."""
        content = (
            "# Title\n"
            "Intro text\n"
            "## Section 1\n"
            "Content\n"
        )
        opening = get_opening_section(content)
        self.assertIn("## Table of Contents", opening)
        self.assertIn("- [Title](#title)", opening)

    def test_extract_section_by_slug(self):
        """Test extracting a section using its slug."""
        content = (
            "# Title\n"
            "## API Conventions\n"
            "Content about API conventions\n"
            "## Other Section\n"
            "Other content\n"
        )
        section = extract_section(content, "api-conventions")
        self.assertIn("## API Conventions", section)
        self.assertIn("Content about API conventions", section)
        self.assertNotIn("Other Section", section)

    def test_extract_section_by_slug_with_special_chars(self):
        """Test extracting a section with special characters using slug."""
        content = (
            "# Title\n"
            "## Working with LLMs\n"
            "Content about LLMs\n"
            "## Next Section\n"
            "Next content\n"
        )
        section = extract_section(content, "working-with-llms")
        self.assertIn("## Working with LLMs", section)
        self.assertIn("Content about LLMs", section)

    def test_extract_section_exact_title_with_quotes(self):
        """Test extracting a section using exact quoted title."""
        content = (
            "# Title\n"
            "## Quoted Title\n"
            "Content here\n"
            "## Another Section\n"
            "More content\n"
        )
        section = extract_section(content, "Quoted Title")
        self.assertIn("## Quoted Title", section)
        self.assertIn("Content here", section)

    def test_find_matching_sections_multiple_matches(self):
        """Test finding multiple sections with same slug."""
        content = (
            "# Title\n"
            "## My Section\n"
            "First content\n"
            "## Another Header\n"
            "Middle content\n"
            "## My Section\n"
            "Second content\n"
        )
        matches = find_matching_sections(content, "my-section")
        self.assertEqual(len(matches), 2)
        self.assertEqual(matches[0][2], "My Section")
        self.assertEqual(matches[1][2], "My Section")

    def test_extract_section_ambiguous_exits(self):
        """Test that ambiguous section name causes exit with line numbers."""
        content = (
            "# Title\n"
            "## My Section\n"
            "First content\n"
            "## My Section\n"
            "Second content\n"
        )
        # Capture stderr to verify line numbers are included
        import io
        from unittest.mock import patch

        with patch('sys.stderr', new_callable=io.StringIO) as mock_stderr:
            with self.assertRaises(SystemExit) as cm:
                extract_section(content, "my-section")
            self.assertEqual(cm.exception.code, 1)
            error_output = mock_stderr.getvalue()
        # Verify line numbers are present in the error message
        self.assertIn("lines", error_output)
        self.assertIn("My Section", error_output)

    def test_extract_section_slug_vs_title_same_match(self):
        """Test that slug and title matching the same section works."""
        content = (
            "# Title\n"
            "## Hello World\n"
            "Content here\n"
        )
        # Both should match the same section, no ambiguity
        section_by_title = extract_section(content, "Hello World")
        self.assertIn("## Hello World", section_by_title)
        section_by_slug = extract_section(content, "hello-world")
        self.assertIn("## Hello World", section_by_slug)
# Script entry point: invoke the Click command group only when this file is
# executed directly, not when it is imported as a module.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment