Created
May 5, 2025 15:33
-
-
Save kenny-kvibe/83191ecbebdbc3a9b4c406a8fb7d6570 to your computer and use it in GitHub Desktop.
Search for and print the files that differ between 2 or more directories, based on their SHA-256 checksums
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import hashlib | |
| import json | |
| import os | |
# Directories to compare; passed to find_diff_files() by main().
# find_diff_files() raises ValueError when fewer than 2 paths are listed.
TARGET_DIRS_PATH: list[str] = []  # <-- Find in these directories
| def walk_directory(dir_path: str, exclude_names: tuple[str, ...] | list[str] | None = None) -> list[str]: | |
| file_paths = [] | |
| for file_name in os.listdir(dir_path): | |
| file_path = os.path.join(dir_path, file_name) | |
| if os.path.exists(file_path) and (not exclude_names or file_name not in exclude_names): | |
| if os.path.isdir(file_path): | |
| file_paths.extend(walk_directory(file_path, exclude_names)) | |
| else: | |
| file_paths.append(file_path) | |
| return file_paths | |
def file_checksum_sha256(file_path: str) -> str:
    """Return the upper-case hexadecimal SHA-256 digest of a file.

    Args:
        file_path: Path of the file to hash.

    Returns:
        The digest as an upper-case hex string, or an empty string when
        ``file_path`` does not exist.
    """
    if not os.path.exists(file_path):
        return ''
    digest = hashlib.sha256()
    with open(file_path, 'rb') as stream:
        # Read in 8 KiB blocks so large files are never loaded whole.
        for block in iter(lambda: stream.read(8192), b''):
            digest.update(block)
    return digest.hexdigest().upper()
def find_diff_files(
    dirs_paths_ls: tuple[str, ...] | list[str],
    exclude_names: tuple[str, ...] | list[str] | None = None,
) -> dict[str, list[str]]:
    """Find files that differ between two or more directory trees.

    Every file under each directory is hashed with ``file_checksum_sha256``
    and keyed by its path relative to that directory. A file is reported for
    a directory when at least one other directory lacks that relative path
    or holds different content under it.

    A summary (whether all trees contain the same file names, the per-tree
    file counts, and the differences as JSON) is printed to stdout.

    Args:
        dirs_paths_ls: At least two directory paths to compare.
        exclude_names: Entry names skipped while walking (forwarded to
            ``walk_directory``).

    Returns:
        A mapping from each directory path to the list of relative file
        paths that differ from, or are missing in, some other directory.
        Each file is listed at most once per directory.

    Raises:
        ValueError: If fewer than two directory paths are given.
    """
    if len(dirs_paths_ls) < 2:
        raise ValueError('Atleast 2 directory paths are needed to find the differences')

    # directory -> {relative file path -> checksum}
    files: dict[str, dict[str, str]] = {}
    for dir_path in dirs_paths_ls:
        # relpath yields the same key for the same file in every tree,
        # regardless of the OS separator or a trailing separator on dir_path.
        files[dir_path] = {
            os.path.relpath(file_path, dir_path): file_checksum_sha256(file_path)
            for file_path in walk_directory(dir_path, exclude_names)
        }

    # Compare names as sets: directory listing order is arbitrary and must
    # not influence whether two trees are considered to hold the same names.
    name_sets = [set(checksums) for checksums in files.values()]
    equal_names = all(names == name_sets[0] for names in name_sets[1:])
    print(f'Equal named files = {equal_names}')
    for dir_path, checksums in files.items():
        print(f'"{dir_path}" count = {len(checksums)}')

    diffs: dict[str, list[str]] = {dir_path: [] for dir_path in dirs_paths_ls}
    for dir_path, checksums in files.items():
        other_trees = [files[other] for other in files if other != dir_path]
        # .get() returns None for a missing file, which never equals a
        # checksum string, so "missing" and "different" are both caught;
        # any() reports each file once even if several trees disagree.
        diffs[dir_path] = [
            name
            for name, checksum in checksums.items()
            if any(tree.get(name) != checksum for tree in other_trees)
        ]

    print(f'Diff files = {json.dumps(diffs, indent=2)}')
    return diffs
def main() -> int:
    """Compare the configured target directories and return exit code 0.

    ``__pycache__`` entries are excluded from the comparison.
    """
    excluded_names = ['__pycache__']
    find_diff_files(TARGET_DIRS_PATH, excluded_names)
    print('Done.')
    return 0
# Run the comparison only when executed as a script, and hand main()'s
# return value to the interpreter as the process exit status.
if __name__ == '__main__':
    exit_code = main()
    raise SystemExit(exit_code)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment