Last active
August 20, 2025 03:39
-
-
Save craigds/aa6fa10c7cc6f45d7b32c6ed1b99a15e to your computer and use it in GitHub Desktop.
testing large zip iteration with VSIReadDirRecursive
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| """Create a zip file with configurable dirs x files x 1000 null bytes each (compressed).""" | |
| import zipfile | |
| import argparse | |
def create_large_zip(
    output_file="large.zip", num_dirs=1000, files_per_dir=1000, file_size=1000
):
    """Create a deflate-compressed zip holding many identical null-byte files.

    The archive contains ``num_dirs`` directories named ``dir_NNNN``; each one
    holds ``files_per_dir`` members named ``file_NNNN.dat`` (indices are
    zero-padded to at least four digits). Every member consists of
    ``file_size`` null bytes. Progress is printed to stdout.

    Args:
        output_file: Path of the zip archive to write (opened in ``'w'`` mode).
        num_dirs: Number of directories to create inside the archive.
        files_per_dir: Number of files written into each directory.
        file_size: Size in bytes of every file's (all-zero) content.

    Raises:
        ValueError: If ``num_dirs``, ``files_per_dir`` or ``file_size`` is
            negative.
    """
    # Validate before touching the filesystem so a bad call leaves no file behind.
    for label, value in (
        ("num_dirs", num_dirs),
        ("files_per_dir", files_per_dir),
        ("file_size", file_size),
    ):
        if value < 0:
            raise ValueError(f"{label} must be non-negative, got {value}")

    # Build the payload once; every archive member reuses the same bytes object.
    null_data = b"\x00" * file_size

    total_files = num_dirs * files_per_dir
    print(f"Creating {total_files:,} files ({num_dirs} dirs x {files_per_dir} files/dir)...")

    # Report progress roughly every 10% of the directories. max() keeps the
    # step at 1 or more so the modulo below never divides by zero.
    progress_step = max(1, num_dirs // 10)

    with zipfile.ZipFile(output_file, "w", compression=zipfile.ZIP_DEFLATED) as zf:
        for dir_num in range(num_dirs):
            if dir_num % progress_step == 0:
                print(f"Creating directory {dir_num}/{num_dirs}...")
            dir_name = f"dir_{dir_num:04d}"
            for file_num in range(files_per_dir):
                zf.writestr(f"{dir_name}/file_{file_num:04d}.dat", null_data)

    print(f"Created {output_file}")


def _non_negative_int(text):
    """argparse ``type`` callback: parse *text* as an int and reject negatives."""
    try:
        value = int(text)
    except ValueError:
        raise argparse.ArgumentTypeError(f"invalid integer value: {text!r}") from None
    if value < 0:
        raise argparse.ArgumentTypeError(f"must be non-negative, got {value}")
    return value


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Create a large zip file with many small files")
    parser.add_argument("--dirs", type=_non_negative_int, default=1000, help="Number of directories (default: 1000)")
    parser.add_argument("--files-per-dir", type=_non_negative_int, default=1000, help="Files per directory (default: 1000)")
    parser.add_argument("--file-size", type=_non_negative_int, default=1000, help="Null bytes per file (default: 1000)")
    parser.add_argument("--output", default="large.zip", help="Output zip file (default: large.zip)")
    args = parser.parse_args()

    create_large_zip(args.output, args.dirs, args.files_per_dir, args.file_size)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| """ | |
| Test script to benchmark ZIP file iteration using GDAL's ReadDirRecursive. | |
| This script tests the performance improvements from the optimized OpenDir | |
| implementation for archive files. | |
| """ | |
| import sys | |
| import time | |
| import os | |
| from collections import defaultdict | |
| from osgeo import gdal | |
# Have GDAL raise Python exceptions on failure so the try/except blocks in
# this script can report errors instead of silently receiving None.
gdal.UseExceptions()

# Turn GDAL debug logging on by default, but do not override a CPL_DEBUG value
# the user already configured (e.g. CPL_DEBUG=OFF in the environment to keep
# logging overhead out of the timings measured below).
if gdal.GetConfigOption("CPL_DEBUG") is None:
    gdal.SetConfigOption("CPL_DEBUG", "ON")
def _files_per_second(file_count, elapsed):
    """Return files/second, or infinity when no elapsed time was measured."""
    return file_count / elapsed if elapsed > 0 else float("inf")


def _group_directories_by_depth(files):
    """Map each depth to the distinct directory paths seen at that depth.

    Entries ending in "/" are treated as directories; every other entry
    contributes its parent directory ("" for entries at the archive root).
    Depth is the number of "/" characters in the directory path. Paths keep
    first-seen order and are deduplicated.
    """
    # A dict used as an ordered set gives O(1) membership tests; scanning a
    # list per file is quadratic on archives with many files per directory.
    grouped = defaultdict(dict)
    for file_path in files:
        if file_path.endswith("/"):
            dir_path = file_path
        else:
            # rpartition yields ("", "", name) for root-level files, so the
            # parent of a root file is the empty string at depth 0.
            parent, sep, _ = file_path.rpartition("/")
            dir_path = parent + sep
        grouped[dir_path.count("/")][dir_path] = None
    return {depth: list(paths) for depth, paths in grouped.items()}


def _pick_test_dirs(vsi_zip_path, directories, limit=5):
    """Return the archive root plus the middle directory of each depth > 0.

    At most ``limit`` paths are returned, shallowest depths first.
    """
    test_dirs = [vsi_zip_path]
    for depth in sorted(directories):
        dirs_at_depth = directories[depth]
        if depth <= 0 or not dirs_at_depth:
            continue
        subdir = dirs_at_depth[len(dirs_at_depth) // 2].rstrip("/")
        if not subdir:
            continue
        test_dirs.append(f"{vsi_zip_path}/{subdir}")
        if len(test_dirs) >= limit:
            break
    return test_dirs


def analyze_zip_performance(zip_path):
    """Time gdal.ReadDirRecursive on a ZIP file and print a report to stdout.

    The report has three parts: the total listing time for the whole archive,
    a table of directories grouped by depth, and per-directory timings for the
    archive root plus up to four sampled subdirectories.

    Args:
        zip_path: Filesystem path of the ZIP file; it is accessed through
            GDAL as ``/vsizip/<zip_path>``.

    Returns:
        None. Problems (missing file, GDAL error, no listing) are reported
        with a printed message rather than an exception.
    """
    if not os.path.exists(zip_path):
        print(f"Error: ZIP file not found: {zip_path}")
        return

    vsi_zip_path = f"/vsizip/{zip_path}"

    print(f"Analyzing ZIP file: {zip_path}")
    print(f"VSI path: {vsi_zip_path}")
    print()

    # Time a full recursive listing from the archive root first.
    # perf_counter is the highest-resolution clock available for short spans.
    start_time = time.perf_counter()
    try:
        files = gdal.ReadDirRecursive(vsi_zip_path)
    except Exception as e:
        print(f"Error reading ZIP file: {e}")
        return
    total_time = time.perf_counter() - start_time

    if files is None:
        print("No files found or ZIP file is empty")
        return

    print(f"Total files found: {len(files)}")
    print(f"Total time: {total_time:.3f} seconds")
    # Guarded division: a very fast listing can measure as zero seconds.
    print(f"Files per second: {_files_per_second(len(files), total_time):.1f}")
    print()

    directories = _group_directories_by_depth(files)

    print("Directory structure analysis:")
    print("Depth | Count | Examples")
    print("------|-------|----------")

    for depth in sorted(directories):
        dirs_at_depth = directories[depth]
        example_str = ", ".join(dirs_at_depth[:3])  # Show first 3 examples
        if len(dirs_at_depth) > 3:
            example_str += f" ... (+{len(dirs_at_depth) - 3} more)"
        print(f"{depth:5d} | {len(dirs_at_depth):5d} | {example_str}")

    print()

    print("Performance test on specific directories:")
    print("Directory | Files | Time (s) | Files/sec")
    print("----------|-------|----------|----------")

    for test_dir in _pick_test_dirs(vsi_zip_path, directories):
        # Every test path starts with vsi_zip_path, so strip it as a prefix
        # (str.replace would also remove later occurrences inside the path).
        display_dir = test_dir[len(vsi_zip_path):].lstrip("/") or "[root]"

        start_time = time.perf_counter()
        try:
            dir_files = gdal.ReadDirRecursive(test_dir)
        except Exception as e:
            print(f"{display_dir:10s} | ERROR: {e}")
            continue
        dir_time = time.perf_counter() - start_time

        file_count = len(dir_files) if dir_files else 0
        files_per_sec = _files_per_second(file_count, dir_time)

        print(
            f"{display_dir:10s} | {file_count:5d} | {dir_time:8.3f} | {files_per_sec:9.1f}"
        )
def main():
    """Command-line entry point.

    Expects exactly one argument, the path of the ZIP file to analyze. With
    any other argument count it prints a usage message and exits with
    status 1; otherwise it prints the GDAL version and runs the analysis.
    """
    cli_args = sys.argv[1:]

    if len(cli_args) != 1:
        usage_lines = (
            "Usage: python test_zip_performance.py <zipfile>",
            "",
            "This script tests GDAL's ReadDirRecursive performance on ZIP files.",
            "It measures timing and provides analysis of the directory structure.",
        )
        for line in usage_lines:
            print(line)
        sys.exit(1)

    # Show which GDAL build produced the numbers that follow.
    print(f"GDAL Version: {gdal.VersionInfo()}")
    print()

    analyze_zip_performance(cli_args[0])


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment