Skip to content

Instantly share code, notes, and snippets.

@nstefan13
Created December 10, 2025 12:56
Show Gist options
  • Select an option

  • Save nstefan13/937d05c6c236d5428c775ef6051c7942 to your computer and use it in GitHub Desktop.

Select an option

Save nstefan13/937d05c6c236d5428c775ef6051c7942 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
"""
create_hierarchy.py

Creates a directory hierarchy containing files, directories and symlinks
(both valid and intentionally broken). Useful for stress-testing recursive
traversal tools that check for broken links.

Usage:
    python3 create_hierarchy.py /path/to/output_root [--dirs N] [--files N] [--symlinks N] [--broken-ratio R] [--seed S]

The script works in two stages:
1) generate_in_memory_tree(...) - builds an in-memory representation of the
   filesystem nodes and decides which symlinks will point to which targets
   (including broken ones).
2) materialize_tree_on_disk(...) - creates directories, files and symlinks
   on the real filesystem and records which links are good vs broken.

At the end the script prints a short summary and writes a JSON report to
"link_report.json" inside the created root directory.
"""
from __future__ import annotations
import argparse
import json
import os
import random
import sys
from dataclasses import dataclass, field
from pathlib import Path
from typing import Dict, List, Optional, Tuple
@dataclass
class Node:
    """Base entry of the in-memory tree: a name plus an optional parent."""

    name: str
    # Directory that contains this node; None while the node is unattached.
    parent: Optional["DirNode"] = None

    def path_components(self) -> List[str]:
        """Return the names leading from the topmost ancestor down to this node.

        The topmost ancestor itself (the node that has no parent) is left
        out, so calling this on a parentless node yields an empty list.
        """
        bottom_up: List[str] = []
        node: Node = self
        # Walk upwards, collecting names until the parentless node is reached.
        while node.parent is not None:
            bottom_up.append(node.name)
            node = node.parent
        return bottom_up[::-1]
@dataclass
class FileNode(Node):
    """A regular file entry of the in-memory tree."""

    # Text payload carried by the file node; empty by default.
    content: str = ""
@dataclass
class DirNode(Node):
    """A directory entry that keeps an ordered list of the nodes it contains."""

    # Each instance gets its own list (default_factory), never a shared one.
    children: List[Node] = field(default_factory=list)

    def add_child(self, node: Node) -> None:
        """Append *node* to this directory and make this directory its parent."""
        self.children.append(node)
        node.parent = self
@dataclass
class SymlinkNode(Node):
    """A symbolic link entry.

    The destination is described either by ``target_node`` (another node of
    the tree) or by ``target_string`` (a raw path string used for links that
    are meant to be broken).
    """

    target_node: Optional[Node] = None
    target_string: Optional[str] = None

    def target_is_broken(self) -> bool:
        """Return True when the link has only a raw string target and no node."""
        if self.target_node is not None:
            return False
        return self.target_string is not None
def generate_in_memory_tree(
    root_name: str = "root",
    n_dirs: int = 10,
    n_files: int = 30,
    n_symlinks: int = 20,
    broken_ratio: float = 0.3,
    max_depth: int = 4,
    seed: int = 42,
) -> Tuple[DirNode, List[SymlinkNode]]:
    """Build a random in-memory tree of directories, files and symlinks.

    Args:
        root_name: Name given to the root directory node.
        n_dirs: Number of sub-directories to create (named ``dir_<i>``).
        n_files: Number of files to create (named ``file_<i>.txt``).
        n_symlinks: Number of symlinks to create (named ``link_<i>``).
        broken_ratio: Probability that a given symlink is made broken.
        max_depth: A new directory is only attached to a parent whose depth
            (number of path components below the root) is smaller than this.
        seed: Seed for the random generator; the same seed and arguments
            produce the same tree.

    Returns:
        A tuple ``(root, symlink_nodes)``: the root directory node and the
        list of every symlink node in creation order.
    """
    # A private generator produces the same sequence random.seed(seed) would,
    # but does not reseed the module-level generator shared with the caller.
    rng = random.Random(seed)
    # Number of random picks tried before falling back to an explicit search.
    max_draws = 12

    root = DirNode(name=root_name, parent=None)
    # Pools of nodes that later steps may pick as parents or link targets.
    dir_pool: List[DirNode] = [root]
    file_pool: List[FileNode] = []

    # Create directories.
    for i in range(n_dirs):
        parent = rng.choice(dir_pool)
        draws = 1
        while len(parent.path_components()) >= max_depth and draws < max_draws:
            parent = rng.choice(dir_pool)
            draws += 1
        if len(parent.path_components()) >= max_depth:
            # Random picks kept landing on directories that are too deep:
            # choose among the eligible ones so max_depth is honoured.
            shallow = [d for d in dir_pool if len(d.path_components()) < max_depth]
            if shallow:
                parent = rng.choice(shallow)
            # With no eligible directory at all (max_depth <= 0) the last
            # pick is kept so that the requested directory is still created.
        new_dir = DirNode(name=f"dir_{i}")
        parent.add_child(new_dir)
        dir_pool.append(new_dir)

    # Create files, each in a randomly chosen directory.
    for i in range(n_files):
        parent = rng.choice(dir_pool)
        new_file = FileNode(name=f"file_{i}.txt", content=f"This is file {i}\n")
        parent.add_child(new_file)
        file_pool.append(new_file)

    # Create symlinks (some good, some intentionally broken).
    symlink_nodes: List[SymlinkNode] = []
    all_targets: List[Node] = dir_pool + file_pool
    for i in range(n_symlinks):
        location = rng.choice(dir_pool)
        should_be_broken = rng.random() < broken_ratio
        linkname = f"link_{i}"
        if should_be_broken:
            # Use a target string that is not expected to exist once the
            # tree is written out: either relative or absolute.
            if rng.random() < 0.5:
                target_string = f"../nonexistent_target_{i}"
            else:
                target_string = f"/nonexistent/definitely_missing_{i}"
            link = SymlinkNode(name=linkname, target_node=None, target_string=target_string)
        else:
            # Point at an existing node (file or directory, root included).
            target = rng.choice(all_targets)
            link = SymlinkNode(name=linkname, target_node=target, target_string=None)
        location.add_child(link)
        symlink_nodes.append(link)

    return root, symlink_nodes
def _write_tree_entry(node: Node, parent_fs_path: Path, report: Dict) -> None:
    """Recursively create the directory or file for *node* under *parent_fs_path*."""
    if isinstance(node, DirNode):
        fsdir = parent_fs_path / node.name
        fsdir.mkdir(parents=True, exist_ok=True)
        report["created_dirs"].append(str(fsdir))
        for child in node.children:
            _write_tree_entry(child, fsdir, report)
    elif isinstance(node, FileNode):
        fsfile = parent_fs_path / node.name
        # Explicit encoding keeps the written bytes independent of the locale.
        fsfile.write_text(node.content, encoding="utf-8")
        report["created_files"].append(str(fsfile))
    # Any other node type (symlinks) is ignored here; links are created later.


def _create_symlink(link_path: Path, target: str, resolved_status: str) -> str:
    """Create a symlink at *link_path* pointing at *target*; return its status.

    Returns *resolved_status* when the new link resolves to an existing path,
    "broken" when it does not, and a "symlink_failed_wrote_fallback: ..."
    string when the link could not be created. In that last case a
    "<link name>.symlink" text file holding the target is written instead.
    """
    try:
        # exists() follows links and is False for a dangling symlink, so
        # is_symlink() is needed to catch a leftover broken link as well.
        if link_path.exists() or link_path.is_symlink():
            link_path.unlink()
        os.symlink(target, str(link_path))
    except OSError as e:
        fallback_file = link_path.with_suffix(link_path.suffix + ".symlink")
        fallback_file.write_text(target, encoding="utf-8")
        return f"symlink_failed_wrote_fallback: {e!r}"
    return resolved_status if os.path.exists(str(link_path)) else "broken"


def materialize_tree_on_disk(root_node: DirNode, root_path: Path, symlink_nodes: List[SymlinkNode]) -> Dict:
    """Write the in-memory tree to disk and report what was created.

    The children of *root_node* are created directly inside *root_path*
    (directories and files first, symlinks afterwards).

    Args:
        root_node: Root of the in-memory tree; its own name is not used.
        root_path: Directory that receives the tree; created if missing.
        symlink_nodes: Symlink nodes to create after files and directories.

    Returns:
        A dict with the keys "created_dirs" and "created_files" (lists of
        path strings), "symlinks" (one dict per link with "path", "target"
        and "status") and "summary" (total / good / broken link counts).

    Raises:
        RuntimeError: If *root_path* exists and is not a directory.
    """
    report: Dict = {
        "created_dirs": [],
        "created_files": [],
        "symlinks": [],  # list of dicts: {path, target, status}
    }
    if root_path.exists() and not root_path.is_dir():
        raise RuntimeError(f"Target exists and is not a directory: {root_path}")
    base = root_path
    base.mkdir(parents=True, exist_ok=True)

    # First pass: directories and files (symlink nodes are skipped).
    for child in root_node.children:
        _write_tree_entry(child, base, report)

    # Second pass: symlinks, once every possible target exists.
    good = 0
    broken = 0
    for sl in symlink_nodes:
        link_parent_components = sl.parent.path_components() if sl.parent else []
        link_parent_fs = base.joinpath(*link_parent_components)
        link_path = link_parent_fs / sl.name
        if sl.target_node is not None:
            target_fs_path = base.joinpath(*sl.target_node.path_components())
            # Store the target relative to the link's directory.
            try:
                target = os.path.relpath(str(target_fs_path), start=str(link_parent_fs))
            except ValueError:
                # relpath cannot relate the two paths; use the target as is.
                target = str(target_fs_path)
            resolved_status = "good"
        else:
            # Intentionally broken link: the raw string is the link target.
            target = sl.target_string or ""
            # Surprising outcome: the supposedly missing target exists.
            resolved_status = "good_surprising"

        status = _create_symlink(link_path, target, resolved_status)
        # Everything except a link that resolves counts as broken,
        # including links that could not be created at all.
        if status == resolved_status:
            good += 1
        else:
            broken += 1
        report["symlinks"].append({
            "path": str(link_path),
            "target": target,
            "status": status,
        })

    report["summary"] = {
        "symlinks_total": len(report["symlinks"]),
        "symlinks_good": good,
        "symlinks_broken": broken,
    }
    return report
def main(argv=None):
    """Command-line entry point: generate the tree, write it out, print a summary.

    Args:
        argv: Argument list to parse; None makes argparse read sys.argv[1:].

    An out-of-range --broken-ratio is rejected through ``parser.error``,
    which prints a usage message and exits with status 2.
    """
    parser = argparse.ArgumentParser(description="Create a test directory tree with valid and broken symlinks.")
    parser.add_argument("rootdir", help="Directory in which to create the hierarchy")
    parser.add_argument("--dirs", type=int, default=10, help="Number of extra subdirectories to create")
    parser.add_argument("--files", type=int, default=30, help="Number of files to create")
    parser.add_argument("--symlinks", type=int, default=20, help="Number of symlinks to create")
    parser.add_argument("--broken-ratio", type=float, default=0.3, help="Fraction of symlinks that should be broken (0.0-1.0)")
    parser.add_argument("--seed", type=int, default=42, help="Random seed for deterministic trees")
    args = parser.parse_args(argv)

    # Validate before touching the filesystem; the chained comparison is also
    # False for NaN, so that is rejected too.
    if not 0.0 <= args.broken_ratio <= 1.0:
        parser.error(f"--broken-ratio must be between 0.0 and 1.0 (got {args.broken_ratio})")

    rootdir = Path(args.rootdir).expanduser().resolve()

    # Build in-memory tree
    print(f"Generating in-memory tree (seed={args.seed})...")
    root_node, symlink_nodes = generate_in_memory_tree(
        root_name=rootdir.name,
        n_dirs=args.dirs,
        n_files=args.files,
        n_symlinks=args.symlinks,
        broken_ratio=args.broken_ratio,
        seed=args.seed,
    )

    # Materialize on disk
    print(f"Materializing tree at: {rootdir} ...")
    report = materialize_tree_on_disk(root_node, rootdir, symlink_nodes)

    # Write report next to the generated entries, inside the root directory
    report_path = rootdir / "link_report.json"
    report_path.write_text(json.dumps(report, indent=2))

    print("\nDone. Summary:")
    print(f" directories created: {len(report['created_dirs'])}")
    print(f" files created: {len(report['created_files'])}")
    print(f" symlinks total: {report['summary']['symlinks_total']}")
    print(f" good: {report['summary']['symlinks_good']}")
    print(f" broken: {report['summary']['symlinks_broken']}")
    print(f"Report written to: {report_path}")
# Run the generator only when this file is executed as a script.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment