Skip to content

Instantly share code, notes, and snippets.

@kanzure
Created January 23, 2026 20:28
Show Gist options
  • Select an option

  • Save kanzure/8527aaabe81a88f20f1c0f511597e63d to your computer and use it in GitHub Desktop.

Select an option

Save kanzure/8527aaabe81a88f20f1c0f511597e63d to your computer and use it in GitHub Desktop.
docreader: a tool for LLMs for reading and navigating markdown documents
#!/usr/bin/env -S uv run --script --with=click
"""
docreader - A tool for reading and navigating Markdown documents.
This script provides efficient access to specific sections within Markdown files,
making it particularly useful when working with large documentation.
Features:
- Parse and display a table of contents for any Markdown file
- Extract specific sections by name or slug (case-insensitive)
- Handle multiple section extractions in a single command
- Auto-generate TOC when one doesn't exist
Token Efficiency for LLMs:
When working with Large Language Models, context window limits are a critical
constraint. This tool helps reduce token consumption by:
1. Selective Extraction: Retrieve only the specific sections you need instead
of feeding entire documentation files into the context.
2. Quick Navigation: Preview the TOC to identify relevant sections before
requesting their full content.
3. Batch Processing: Extract multiple related sections in a single call,
reducing the number of separate requests needed.
Example usage for LLM workflows:
# First, see what's available
./docreader README.md
# Then, extract only what you need
./docreader README.md "API Reference" "Authentication"
:author: Bryan Bishop <[email protected]>
:date: 2026-01-23
"""
import re
import sys
import unittest
from pathlib import Path
import click
def parse_markdown_headers(content: str) -> list[tuple[int, str, str]]:
    """Parse ATX-style (``#``-prefixed) markdown headers from content.

    Lines inside fenced code blocks (opened by three or more backticks or
    tildes) are skipped, so ``# comment`` lines in code samples are not
    reported as headers.

    Args:
        content: Full markdown text.

    Returns:
        List of tuples ``(level, title, anchor_id)`` in document order.
        ``level`` is the number of leading ``#`` characters (1-6) and
        ``anchor_id`` is the lowercased title with punctuation removed and
        spaces replaced by hyphens.
    """
    headers = []
    # Marker that opened the fenced code block we are currently inside,
    # or None when we are outside any fence.
    open_fence = None
    for line in content.split('\n'):
        fence_match = re.match(r'^ {0,3}(`{3,}|~{3,})', line)
        if fence_match:
            marker = fence_match.group(1)
            if open_fence is None:
                open_fence = marker
            elif (
                marker[0] == open_fence[0]
                and len(marker) >= len(open_fence)
                and not line.strip().strip(marker[0])
            ):
                # A closing fence uses the same character as the opener, is
                # at least as long, and has nothing after the marker.
                open_fence = None
            continue
        if open_fence is not None:
            continue

        match = re.match(r'^(#{1,6})\s+(.+)$', line)
        if not match:
            continue
        level = len(match.group(1))
        title = match.group(2).strip()
        # Create anchor ID by lowercasing and replacing spaces with hyphens
        anchor = re.sub(r'[^\w\s-]', '', title.lower()).strip().replace(' ', '-')
        headers.append((level, title, anchor))
    return headers
def find_existing_toc(content: str) -> tuple[int | None, int | None]:
"""Find existing table of contents section.
Returns (toc_start, toc_end) line indices or (None, None) if not found.
"""
lines = content.split('\n')
toc_start = None
toc_end = None
for i, line in enumerate(lines):
# Look for a TOC header
if re.match(r'^#+\s*Table\s+of\s+Contents', line, re.IGNORECASE):
toc_start = i
# Find where TOC ends (next header or end of file)
for j in range(i + 1, len(lines)):
if lines[j].startswith('#') and lines[j].strip():
toc_end = j
break
if toc_end is None:
toc_end = len(lines)
break
return (toc_start, toc_end)
def generate_toc(headers: list[tuple[int, str, str]]) -> str:
    """Generate a markdown table of contents from parsed headers.

    Args:
        headers: Tuples of ``(level, title, anchor_id)``.

    Returns:
        A "## Table of Contents" heading followed by one bullet per header,
        each linking to ``#anchor_id``. An empty ``headers`` list yields just
        the heading.
    """
    # Two spaces per level: a nested "- " item must be indented to the
    # parent's content column, otherwise Markdown treats it as a sibling.
    indent_unit = "  "
    toc_lines = ["## Table of Contents\n"]
    toc_lines.extend(
        f"{indent_unit * (level - 1)}- [{title}](#{anchor})"
        for level, title, anchor in headers
    )
    return "\n".join(toc_lines)
def find_matching_sections(content: str, section_name: str) -> list[tuple[int, int, str]]:
"""Find all sections matching the given name (by title or slug).
Returns list of tuples: (start_line, end_line, title)
"""
lines = content.split('\n')
matches = []
# Generate slug from section_name for comparison
search_slug = re.sub(r'[^\w\s-]', '', section_name.lower()).strip().replace(' ', '-')
for i, line in enumerate(lines):
match = re.match(r'^(#+)\s+(.+)$', line)
if match:
level = len(match.group(1))
title = match.group(2).strip()
# Generate slug from title
title_slug = re.sub(r'[^\w\s-]', '', title.lower()).strip().replace(' ', '-')
# Match by exact title (case-insensitive) or by slug
if title.lower() == section_name.lower() or title_slug == search_slug:
# Find where section ends (next header of same or higher level)
section_end = None
for j in range(i + 1, len(lines)):
next_match = re.match(r'^(#+)\s+', lines[j])
if next_match:
next_level = len(next_match.group(1))
if next_level <= level:
section_end = j
break
if section_end is None:
section_end = len(lines)
matches.append((i, section_end, title))
return matches
def extract_section(content: str, section_name: str) -> str:
    """Return the text of one section, header line included.

    The section is looked up by exact title (case-insensitive) or by slug.
    When nothing matches, a "not found" message is returned instead of the
    section text. When several sections match, the candidates are listed on
    stderr and the process exits with status 1.
    """
    candidates = find_matching_sections(content, section_name)

    if len(candidates) == 0:
        return f"Section '{section_name}' not found."

    if len(candidates) > 1:
        click.echo(f"Error: '{section_name}' is ambiguous. Matches multiple sections:", err=True)
        for first_line, stop_line, heading in candidates:
            # Report 1-based line numbers; stop_line is exclusive, so it is
            # already the 1-based number of the section's last line.
            click.echo(f" - {heading} (lines {first_line + 1}-{stop_line})", err=True)
        sys.exit(1)

    first_line, stop_line, _ = candidates[0]
    document_lines = content.split('\n')
    return '\n'.join(document_lines[first_line:stop_line])
def get_opening_section(content: str) -> str:
    """Return the document's opening part, ending with its table of contents.

    If the document already has a TOC section, everything from the first
    line through the end of that TOC is returned. Otherwise a TOC is
    generated from the document's headers and returned on its own.
    """
    toc_start, toc_end = find_existing_toc(content)

    if toc_start is None:
        # No TOC in the document: build one from the parsed headers.
        return generate_toc(parse_markdown_headers(content))

    # Keep everything up to (not including) the line where the TOC ends.
    return '\n'.join(content.split('\n')[:toc_end])
class DefaultGroup(click.Group):
    """A Click group that routes to a default command.

    When the first command-line argument is not the name of a registered
    subcommand, ``default_cmd`` is inserted in front of the arguments so the
    default command receives them. The group's own help option (``--help``)
    is left alone so that it still shows the group help, which lists every
    subcommand.
    """

    def __init__(self, *args, default_cmd: str | None = None, **kwargs):
        """Store ``default_cmd``; all other arguments go to ``click.Group``."""
        super().__init__(*args, **kwargs)
        self.default_cmd = default_cmd

    def parse_args(self, ctx, args):
        """Prepend the default command name when no subcommand was named."""
        if args and self.default_cmd:
            first = args[0]
            # Without this check "--help" would be rewritten to
            # "<default_cmd> --help" and the group help could never be shown.
            wants_group_help = first in self.get_help_option_names(ctx)
            if first not in self.commands and not wants_group_help:
                args = [self.default_cmd] + list(args)
        return super().parse_args(ctx, args)
@click.group(cls=DefaultGroup, default_cmd='read', invoke_without_command=True)
@click.pass_context
def main(ctx: click.Context) -> None:
    """DocReader - A tool for reading and navigating Markdown documents.
    Use 'docreader FILENAME [SECTIONS...]' to read documents,
    or 'docreader test' to run the test suite.
    """
    # NOTE: the docstring above is what Click prints as the group help text.
    if ctx.invoked_subcommand is not None:
        # A subcommand will run; nothing to do at the group level.
        return
    # Invoked with no arguments at all: show the help instead of doing nothing.
    click.echo(ctx.get_help())
@main.command('read')
@click.argument('filename', type=click.Path(exists=True, dir_okay=False))
@click.argument('sections', nargs=-1, required=False)
def read_command(filename: str, sections: tuple[str, ...]) -> None:
    """Read and navigate Markdown documents.
    FILENAME: Path to the markdown file to read.
    SECTIONS: Optional section names to extract. If not provided, displays the table of contents.
    """
    # NOTE: the docstring above is what Click prints as this command's help.
    # dir_okay=False makes Click reject a directory with a usage error rather
    # than letting the read below fail with a traceback.
    content = Path(filename).read_text(encoding='utf-8')

    if not sections:
        # Display TOC or opening section
        click.echo(get_opening_section(content))
        # Add usage help
        click.echo("\n" + "-" * 40)
        click.echo("\nTo read a specific section, use the section name or its slug:")
        click.echo(f" docreader {filename} \"Section Name Goes Here\"")
        click.echo(f" docreader {filename} <section-slug>")
        click.echo("\nExample:")
        click.echo(f" docreader {filename} my-section-slug-here")
        return

    # Extract and display each requested section; when several were
    # requested, a separator line follows each one.
    show_separator = len(sections) > 1
    for section in sections:
        click.echo(extract_section(content, section))
        if show_separator:
            click.echo("\n" + "-" * 80 + "\n")
@main.command('test')
@click.option('-v', '--verbose', is_flag=True, help='Verbose test output')
def test_command(verbose: bool) -> None:
    """Run the test suite."""
    # Build the suite from the embedded TestDocReader case.
    suite = unittest.TestLoader().loadTestsFromTestCase(TestDocReader)
    # unittest verbosity: 1 prints dots, 2 prints one line per test.
    runner = unittest.TextTestRunner(verbosity=2 if verbose else 1)
    outcome = runner.run(suite)
    # Mirror the test result in the process exit status.
    exit_status = 0 if outcome.wasSuccessful() else 1
    sys.exit(exit_status)
class TestDocReader(unittest.TestCase):
    """Unit tests for DocReader functions.

    Fixtures are written as explicit string literals so their content does
    not depend on how this file is indented.
    """

    def test_parse_markdown_headers(self):
        """Test parsing markdown headers."""
        content = (
            "# Title 1\n"
            "Some content\n"
            "## Title 2\n"
            "More content\n"
            "### Title 3\n"
            "Even more content\n"
        )
        headers = parse_markdown_headers(content)
        self.assertEqual(len(headers), 3)
        self.assertEqual(headers[0], (1, "Title 1", "title-1"))
        self.assertEqual(headers[1], (2, "Title 2", "title-2"))
        self.assertEqual(headers[2], (3, "Title 3", "title-3"))

    def test_parse_markdown_headers_with_special_chars(self):
        """Test parsing headers with special characters."""
        content = (
            "# API Conventions\n"
            "## Table of Contents\n"
            "### Working with LLMs\n"
        )
        headers = parse_markdown_headers(content)
        self.assertEqual(len(headers), 3)
        self.assertEqual(headers[0], (1, "API Conventions", "api-conventions"))
        self.assertEqual(headers[1], (2, "Table of Contents", "table-of-contents"))
        self.assertEqual(headers[2], (3, "Working with LLMs", "working-with-llms"))

    def test_find_existing_toc(self):
        """Test finding existing table of contents."""
        # The blank lines matter: the asserted indices (2 and 7) count them.
        content = (
            "# Title\n"
            "\n"
            "## Table of Contents\n"
            "\n"
            "- [Section 1](#section-1)\n"
            "- [Section 2](#section-2)\n"
            "\n"
            "## Section 1\n"
            "Content here\n"
        )
        toc_start, toc_end = find_existing_toc(content)
        self.assertEqual(toc_start, 2)
        self.assertEqual(toc_end, 7)

    def test_find_existing_toc_not_found(self):
        """Test when TOC doesn't exist."""
        content = (
            "# Title\n"
            "## Section 1\n"
            "Content here\n"
        )
        toc_start, toc_end = find_existing_toc(content)
        self.assertIsNone(toc_start)
        self.assertIsNone(toc_end)

    def test_generate_toc(self):
        """Test generating table of contents."""
        headers = [
            (1, "Title 1", "title-1"),
            (2, "Title 2", "title-2"),
            (3, "Title 3", "title-3"),
        ]
        toc = generate_toc(headers)
        self.assertIn("## Table of Contents", toc)
        self.assertIn("- [Title 1](#title-1)", toc)
        self.assertIn(" - [Title 2](#title-2)", toc)
        self.assertIn(" - [Title 3](#title-3)", toc)

    def test_extract_section(self):
        """Test extracting a specific section."""
        content = (
            "# Title\n"
            "## Section 1\n"
            "Content of section 1\n"
            "## Section 2\n"
            "Content of section 2\n"
        )
        section = extract_section(content, "Section 1")
        self.assertIn("## Section 1", section)
        self.assertIn("Content of section 1", section)
        self.assertNotIn("Section 2", section)

    def test_extract_section_not_found(self):
        """Test extracting a non-existent section."""
        content = (
            "# Title\n"
            "## Section 1\n"
            "Content\n"
        )
        section = extract_section(content, "NonExistent")
        self.assertEqual(section, "Section 'NonExistent' not found.")

    def test_extract_section_case_insensitive(self):
        """Test that section extraction is case-insensitive."""
        content = (
            "# Title\n"
            "## My Section\n"
            "Content\n"
        )
        section = extract_section(content, "my section")
        self.assertIn("## My Section", section)

    def test_get_opening_section_with_toc(self):
        """Test getting opening section when TOC exists."""
        content = (
            "# Title\n"
            "Intro text\n"
            "## Table of Contents\n"
            "- [Section 1](#section-1)\n"
            "## Section 1\n"
            "Content\n"
        )
        opening = get_opening_section(content)
        self.assertIn("# Title", opening)
        self.assertIn("## Table of Contents", opening)
        self.assertNotIn("## Section 1", opening)

    def test_get_opening_section_without_toc(self):
        """Test getting opening section when TOC doesn't exist."""
        content = (
            "# Title\n"
            "Intro text\n"
            "## Section 1\n"
            "Content\n"
        )
        opening = get_opening_section(content)
        self.assertIn("## Table of Contents", opening)
        self.assertIn("- [Title](#title)", opening)

    def test_extract_section_by_slug(self):
        """Test extracting a section using its slug."""
        content = (
            "# Title\n"
            "## API Conventions\n"
            "Content about API conventions\n"
            "## Other Section\n"
            "Other content\n"
        )
        section = extract_section(content, "api-conventions")
        self.assertIn("## API Conventions", section)
        self.assertIn("Content about API conventions", section)
        self.assertNotIn("Other Section", section)

    def test_extract_section_by_slug_with_special_chars(self):
        """Test extracting a section with special characters using slug."""
        content = (
            "# Title\n"
            "## Working with LLMs\n"
            "Content about LLMs\n"
            "## Next Section\n"
            "Next content\n"
        )
        section = extract_section(content, "working-with-llms")
        self.assertIn("## Working with LLMs", section)
        self.assertIn("Content about LLMs", section)

    def test_extract_section_exact_title_with_quotes(self):
        """Test extracting a section using exact quoted title."""
        content = (
            "# Title\n"
            "## Quoted Title\n"
            "Content here\n"
            "## Another Section\n"
            "More content\n"
        )
        section = extract_section(content, "Quoted Title")
        self.assertIn("## Quoted Title", section)
        self.assertIn("Content here", section)

    def test_find_matching_sections_multiple_matches(self):
        """Test finding multiple sections with same slug."""
        content = (
            "# Title\n"
            "## My Section\n"
            "First content\n"
            "## Another Header\n"
            "Middle content\n"
            "## My Section\n"
            "Second content\n"
        )
        matches = find_matching_sections(content, "my-section")
        self.assertEqual(len(matches), 2)
        self.assertEqual(matches[0][2], "My Section")
        self.assertEqual(matches[1][2], "My Section")

    def test_extract_section_ambiguous_exits(self):
        """Test that ambiguous section name causes exit with line numbers."""
        content = (
            "# Title\n"
            "## My Section\n"
            "First content\n"
            "## My Section\n"
            "Second content\n"
        )
        # Capture stderr to verify line numbers are included
        import io
        from unittest.mock import patch

        with patch('sys.stderr', new_callable=io.StringIO) as mock_stderr:
            with self.assertRaises(SystemExit) as cm:
                extract_section(content, "my-section")
            self.assertEqual(cm.exception.code, 1)
            error_output = mock_stderr.getvalue()
        # Verify line numbers are present in the error message
        self.assertIn("lines", error_output)
        self.assertIn("My Section", error_output)

    def test_extract_section_slug_vs_title_same_match(self):
        """Test that slug and title matching the same section works."""
        content = (
            "# Title\n"
            "## Hello World\n"
            "Content here\n"
        )
        # Both should match the same section, no ambiguity
        section_by_title = extract_section(content, "Hello World")
        self.assertIn("## Hello World", section_by_title)
        section_by_slug = extract_section(content, "hello-world")
        self.assertIn("## Hello World", section_by_slug)
# Script entry point: run the Click command group only when this file is
# executed directly, not when it is imported.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment