Created
February 14, 2019 21:09
-
-
Save Nicarim/9ee1dd62199f035986e51d590d207fd6 to your computer and use it in GitHub Desktop.
Finds the recursive difference between the folders "TARGET_DIR" and "SOURCE_DIR", assuming that "SOURCE_DIR" is the larger folder, and copies every file not found in "TARGET_DIR" to "DIFF_DIR_COPY".
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import glob | |
| import hashlib | |
| import os | |
| import pickle | |
| import shutil | |
| from os.path import basename, join | |
def get_sha1_from_path(path, buf_size=200000):
    """Return the SHA-1 hex digest of the file at ``path``.

    The file is opened in binary mode and fed to the hash chunk by chunk, so
    the whole file is never held in memory at once.

    Args:
        path: Path of the file to hash.
        buf_size: Maximum number of bytes read per chunk. Defaults to
            200000 bytes, the chunk size this function previously hard-coded.

    Returns:
        The digest as a hexadecimal string.

    Raises:
        ValueError: If ``buf_size`` is not a positive number.
        OSError: If the file cannot be opened or read.
    """
    if buf_size <= 0:
        # read(0) always returns b"", which would silently yield the digest
        # of an empty file instead of the real content.
        raise ValueError("buf_size must be a positive number of bytes")

    sha1 = hashlib.sha1()
    with open(path, 'rb') as f:
        # iter() with a sentinel keeps calling read() until it returns b"" (EOF).
        for chunk in iter(lambda: f.read(buf_size), b""):
            sha1.update(chunk)
    return sha1.hexdigest()
def get_hashes_path(path):
    """Hash every regular, non-hidden file listed in ``path``.

    Directories are skipped, and so are entries whose base name has nothing
    before its first dot (for example ``.gitignore``). A ``done/total`` progress
    line is printed after each file that is hashed.

    Args:
        path: Iterable of filesystem paths; despite the singular name this is
            a collection (the script passes the list returned by
            ``glob.glob``).

    Returns:
        A dict mapping each hashed path to the value returned by
        ``get_sha1_from_path`` for it.
    """
    # Materialize once so generators work too and the total is known up front.
    entries = list(path)
    files_len = len(entries)
    _dict = {}
    # Number from 1 so the progress line reads "N/N" (not "N-1/N") when the
    # last entry has been hashed.
    for i, file in enumerate(entries, start=1):
        if os.path.isdir(file):
            continue
        if not basename(file).split(".")[0]:
            continue
        _dict[file] = get_sha1_from_path(file)
        print(f"{i}/{files_len}")
    return _dict
# Every file whose content exists somewhere under SOURCE_DIR but nowhere under
# TARGET_DIR (compared by SHA-1 digest) is copied into DIFF_DIR_COPY.
TARGET_DIR = r"H:\FolderOne"
DIFF_DIR_COPY = r"H:\OutputFolder"
SOURCE_DIR = r"J:\FolderTwo"

# join() builds the recursive pattern with the platform's separator instead of
# a hard-coded backslash.
files_in_target = glob.glob(join(TARGET_DIR, "**", "*"), recursive=True)
files_in_source = glob.glob(join(SOURCE_DIR, "**", "*"), recursive=True)
files_in_target_len = len(files_in_target)
files_in_source_len = len(files_in_source)


def _load_or_build_hashes(cache_file, files):
    """Return the path -> digest dict stored in ``cache_file``, building it on a miss."""
    # NOTE(review): pickle.load can execute arbitrary code from a crafted
    # file; only run this where the cache files in the working directory are
    # trusted. The cache is never invalidated either - delete the file to
    # force a rescan after the folders change.
    if os.path.exists(cache_file):
        with open(cache_file, 'rb') as f:
            return pickle.load(f)
    hashes = get_hashes_path(files)
    with open(cache_file, 'wb') as f:
        pickle.dump(hashes, f)
    return hashes


def _diff_name(path, digest):
    """Return the base name of ``path`` with the first 3 digest chars appended to its stem."""
    # partition() splits on the first dot only, so a multi-part extension such
    # as ".tar.gz" keeps its dots and a name without extension gets no
    # trailing dot.
    stem, dot, extension = basename(path).partition(".")
    return f"{stem}{digest[:3]}{dot}{extension}"


TARGET_DIR_FILES_HASHED = _load_or_build_hashes('target_pickle', files_in_target)
SOURCE_DIR_FILES_HASHED = _load_or_build_hashes('source_pickle', files_in_source)

# Build the digest set once: membership tests are then O(1) instead of a scan
# over every target digest for each source file.
target_digests = set(TARGET_DIR_FILES_HASHED.values())
missing = [
    (source_path, digest)
    for source_path, digest in SOURCE_DIR_FILES_HASHED.items()
    if digest not in target_digests
]

# Count the files that will really be copied; the difference of the two dict
# sizes is only right when the target is an exact subset of the source.
to_copy = len(missing)
copied = 0

# shutil.copy does not create the destination directory.
os.makedirs(DIFF_DIR_COPY, exist_ok=True)
for source_path, digest in missing:
    shutil.copy(source_path, join(DIFF_DIR_COPY, _diff_name(source_path, digest)))
    copied += 1
    print(f"Copied {copied} / {to_copy}")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment