@craigds
Last active August 20, 2025 03:39
testing large zip iteration with VSIReadDirRecursive

Two scripts: the first generates a zip archive with a configurable number of directories and small files; the second benchmarks GDAL's ReadDirRecursive over that archive.
#!/usr/bin/env python3
"""Create a zip file with configurable dirs x files x 1000 null bytes each (compressed)."""
import zipfile
import argparse


def create_large_zip(output_file="large.zip", num_dirs=1000, files_per_dir=1000):
    """Create a zip with many files containing null bytes."""
    # Create 1000 null bytes in memory once
    null_data = b"\x00" * 1000
    total_files = num_dirs * files_per_dir
    print(f"Creating {total_files:,} files ({num_dirs} dirs x {files_per_dir} files/dir)...")
    with zipfile.ZipFile(output_file, "w", compression=zipfile.ZIP_DEFLATED) as zf:
        for dir_num in range(num_dirs):
            if dir_num % max(1, num_dirs // 10) == 0:
                print(f"Creating directory {dir_num}/{num_dirs}...")
            for file_num in range(files_per_dir):
                # Create file path
                file_path = f"dir_{dir_num:04d}/file_{file_num:04d}.dat"
                # Write the null bytes to the zip
                zf.writestr(file_path, null_data)
    print(f"Created {output_file}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Create a large zip file with many small files")
    parser.add_argument("--dirs", type=int, default=1000, help="Number of directories (default: 1000)")
    parser.add_argument("--files-per-dir", type=int, default=1000, help="Files per directory (default: 1000)")
    parser.add_argument("--output", default="large.zip", help="Output zip file (default: large.zip)")
    args = parser.parse_args()
    create_large_zip(args.output, args.dirs, args.files_per_dir)
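
Before generating the full million-file archive, the function can also be called directly with small parameters as a smoke test. A minimal sketch, assuming the generator above is saved as create_large_zip.py (the module name is illustrative, not part of the gist):

# Sketch: build a small archive and sanity-check the entry count with
# Python's own zipfile module. Module name is an assumption.
import zipfile

from create_large_zip import create_large_zip

create_large_zip("small.zip", num_dirs=10, files_per_dir=10)
with zipfile.ZipFile("small.zip") as zf:
    assert len(zf.namelist()) == 100  # 10 dirs x 10 files each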
#!/usr/bin/env python3
"""
Test script to benchmark ZIP file iteration using GDAL's ReadDirRecursive.

This script tests the performance improvements from the optimized OpenDir
implementation for archive files.
"""
import sys
import time
import os
from collections import defaultdict

from osgeo import gdal

gdal.UseExceptions()
gdal.SetConfigOption("CPL_DEBUG", "ON")


def analyze_zip_performance(zip_path):
    """
    Iterate through a ZIP file using ReadDirRecursive and measure performance
    per directory level.
    """
    if not os.path.exists(zip_path):
        print(f"Error: ZIP file not found: {zip_path}")
        return
    vsi_zip_path = f"/vsizip/{zip_path}"
    print(f"Analyzing ZIP file: {zip_path}")
    print(f"VSI path: {vsi_zip_path}")
    print()

    # Test the root directory first
    start_time = time.monotonic()
    try:
        files = gdal.ReadDirRecursive(vsi_zip_path)
    except Exception as e:
        print(f"Error reading ZIP file: {e}")
        return
    end_time = time.monotonic()
    total_time = end_time - start_time
    if files is None:
        print("No files found or ZIP file is empty")
        return
    print(f"Total files found: {len(files)}")
    print(f"Total time: {total_time:.3f} seconds")
    print(f"Files per second: {len(files) / total_time:.1f}")
    print()

    # Analyze directory structure
    directories = defaultdict(list)
    for file_path in files:
        if file_path.endswith("/"):
            # This is a directory
            depth = file_path.count("/")
            directories[depth].append(file_path)
        else:
            # This is a file - add its parent directory
            if "/" in file_path:
                dir_path = "/".join(file_path.split("/")[:-1]) + "/"
                depth = dir_path.count("/")
                if dir_path not in directories[depth]:
                    directories[depth].append(dir_path)
            else:
                # File in root directory
                if "" not in directories[0]:
                    directories[0].append("")

    print("Directory structure analysis:")
    print("Depth | Count | Examples")
    print("------|-------|----------")
    for depth in sorted(directories.keys()):
        dirs_at_depth = directories[depth]
        examples = dirs_at_depth[:3]  # Show first 3 examples
        example_str = ", ".join(examples)
        if len(dirs_at_depth) > 3:
            example_str += f" ... (+{len(dirs_at_depth) - 3} more)"
        print(f"{depth:5d} | {len(dirs_at_depth):5d} | {example_str}")
    print()

    # Test performance on a few specific subdirectories
    test_dirs = []
    # Add root directory
    test_dirs.append(vsi_zip_path)
    # Add a few subdirectories from different depths
    for depth in sorted(directories.keys()):
        if depth > 0 and len(directories[depth]) > 0:
            # Take the middle directory at this depth
            subdir = directories[depth][len(directories[depth]) // 2].rstrip("/")
            if subdir:  # Make sure it's not empty
                test_dirs.append(f"{vsi_zip_path}/{subdir}")
        if len(test_dirs) >= 5:  # Limit to 5 tests
            break

    print("Performance test on specific directories:")
    print("Directory | Files | Time (s) | Files/sec")
    print("----------|-------|----------|----------")
    for test_dir in test_dirs:
        display_dir = test_dir.replace(vsi_zip_path, "").lstrip("/") or "[root]"
        start_time = time.monotonic()
        try:
            dir_files = gdal.ReadDirRecursive(test_dir)
        except Exception as e:
            print(f"{display_dir:10s} | ERROR: {e}")
            continue
        end_time = time.monotonic()
        dir_time = end_time - start_time
        file_count = len(dir_files) if dir_files else 0
        files_per_sec = file_count / dir_time if dir_time > 0 else float("inf")
        print(
            f"{display_dir:10s} | {file_count:5d} | {dir_time:8.3f} | {files_per_sec:9.1f}"
        )


def main():
    if len(sys.argv) != 2:
        print("Usage: python test_zip_performance.py <zipfile>")
        print()
        print("This script tests GDAL's ReadDirRecursive performance on ZIP files.")
        print("It measures timing and provides analysis of the directory structure.")
        sys.exit(1)
    zip_path = sys.argv[1]
    # Print GDAL version info
    print(f"GDAL Version: {gdal.VersionInfo()}")
    print()
    analyze_zip_performance(zip_path)


if __name__ == "__main__":
    main()
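
The benchmark above materializes the entire listing in one ReadDirRecursive call. Since its docstring refers to GDAL's OpenDir path, a streaming equivalent can be sketched with the VSI directory-iteration API (gdal.OpenDir / gdal.GetNextDirEntry / gdal.CloseDir, available in the Python bindings since GDAL 2.4). This is a minimal sketch, assuming a large.zip in the current directory; it counts entries without holding the full list in memory:

# Sketch: stream the listing with GDAL's directory iterator instead of
# building one large list. The archive path is an assumption.
from osgeo import gdal

gdal.UseExceptions()

d = gdal.OpenDir("/vsizip/large.zip", -1)  # -1 = recurse to any depth
count = 0
try:
    while True:
        entry = gdal.GetNextDirEntry(d)
        if entry is None:
            break
        count += 1
finally:
    gdal.CloseDir(d)
print(f"Streamed {count} entries")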