Skip to content

Instantly share code, notes, and snippets.

@Nicarim
Created February 14, 2019 21:09
Show Gist options
  • Select an option

  • Save Nicarim/9ee1dd62199f035986e51d590d207fd6 to your computer and use it in GitHub Desktop.

Select an option

Save Nicarim/9ee1dd62199f035986e51d590d207fd6 to your computer and use it in GitHub Desktop.
Recursively compares the folders "TARGET_DIR" and "SOURCE_DIR" by file content (SHA-1 hash), assuming that "SOURCE_DIR" is the larger folder, and copies every file whose content is not found in "TARGET_DIR" to "DIFF_DIR_COPY".
import glob
import hashlib
import os
import pickle
import shutil
from os.path import basename, join
def get_sha1_from_path(path):
    """Return the SHA-1 hex digest of the file at ``path``.

    The file is opened in binary mode and fed to the hash chunk by chunk,
    so a file of any size can be hashed without loading it into memory
    at once.

    Args:
        path: Path of the file to hash.

    Returns:
        The hexadecimal SHA-1 digest of the file contents, as ``str``.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # Chunk size in bytes (roughly 195 KiB per read).
    buf_size = 200000
    sha1 = hashlib.sha1()
    with open(path, 'rb') as f:
        # iter() with a sentinel keeps calling read() until it returns
        # b"", which is what a binary file yields at end-of-file.
        for chunk in iter(lambda: f.read(buf_size), b""):
            sha1.update(chunk)
    # hexdigest() already returns str; no extra conversion is needed.
    return sha1.hexdigest()
def get_hashes_path(path):
    """Hash every regular, non-dotfile entry of ``path``.

    Args:
        path: Iterable of filesystem paths (despite the singular name),
            for example the result of a recursive ``glob.glob`` call.
            Any iterable is accepted, including generators.

    Returns:
        A dict mapping each hashed path to the digest returned by
        ``get_sha1_from_path``. Directories and entries whose base name
        has nothing before its first dot (such as ``.gitignore``) are
        left out.

    Side effects:
        Prints ``"<position>/<total>"`` after each file that is hashed.
    """
    # Materialise once so generators work and the total is known up front.
    paths = list(path)
    total = len(paths)
    hashes = {}
    # Count from 1 so the progress output can actually reach total/total.
    for position, file_path in enumerate(paths, start=1):
        if os.path.isdir(file_path):
            continue
        # An empty stem before the first dot means a dotfile: skip it.
        if not basename(file_path).split(".")[0]:
            continue
        hashes[file_path] = get_sha1_from_path(file_path)
        print(f"{position}/{total}")
    return hashes
# Folder whose contents are already known (the smaller tree).
TARGET_DIR = r"H:\FolderOne"
# Folder that receives copies of the files missing from TARGET_DIR.
DIFF_DIR_COPY = r"H:\OutputFolder"
# Folder that is searched for files not present in TARGET_DIR.
SOURCE_DIR = r"J:\FolderTwo"


def _load_or_build_hashes(cache_file, root_dir):
    """Return the {path: sha1} mapping for ``root_dir``, cached in ``cache_file``."""
    # NOTE: the cache is never invalidated. Delete the pickle file after the
    # directory contents change, otherwise stale hashes are reused.
    # NOTE: only load pickle files written by this script; unpickling data
    # from an untrusted source can execute arbitrary code.
    if os.path.exists(cache_file):
        with open(cache_file, 'rb') as f:
            return pickle.load(f)
    # Walk the tree only on a cache miss; the listing is not needed otherwise.
    files = glob.glob(join(root_dir, "**", "*"), recursive=True)
    hashes = get_hashes_path(files)
    with open(cache_file, 'wb') as f:
        pickle.dump(hashes, f)
    return hashes


def _diff_name(path, digest):
    """Return the output file name: stem + first 3 digest chars + extension."""
    # Split at the first dot so a multi-part extension ("tar.gz") is kept
    # intact, and no trailing dot is added when there is no extension.
    stem, dot, extension = basename(path).partition(".")
    # NOTE: only 3 hex characters disambiguate equal base names, so two
    # different files with the same name can still collide in the output.
    return f"{stem}{digest[:3]}{dot}{extension}"


def main():
    """Copy every SOURCE_DIR file whose content is absent from TARGET_DIR."""
    target_hashes = _load_or_build_hashes('target_pickle', TARGET_DIR)
    source_hashes = _load_or_build_hashes('source_pickle', SOURCE_DIR)

    # Build the set once: membership tests against dict.values() are linear.
    known_digests = set(target_hashes.values())
    missing = [
        (path, digest)
        for path, digest in source_hashes.items()
        if digest not in known_digests
    ]
    # The real number of files to copy, not the difference in file counts.
    to_copy = len(missing)

    if missing:
        # shutil.copy does not create the destination directory itself.
        os.makedirs(DIFF_DIR_COPY, exist_ok=True)
    for copied, (path, digest) in enumerate(missing, start=1):
        shutil.copy(path, join(DIFF_DIR_COPY, _diff_name(path, digest)))
        print(f"Copied {copied} / {to_copy}")


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment