@jjshoots
Last active September 5, 2024 03:44
Python Duplicate File Finder
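To run the gist, the two flags defined in the script are -d/--directory (the directory to scan recursively) and -v/--verbose (print extra detail while scanning). Assuming the file is saved as duplicate_finder.py (the filename is not given in the gist), an invocation might look like python duplicate_finder.py -d /path/to/photos -v. Any duplicates found are written to duplicates.txt in the current working directory.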
import time
import argparse
import glob
import hashlib
import os

# Type aliases
FULLPATH = str
FILENAME = str
FILEHASH = str


def hash_file(filename: FULLPATH) -> FILEHASH:
    """Returns the MD5 hash of the file."""
    with open(filename, "rb") as f:
        return hashlib.md5(f.read()).hexdigest()
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        prog="Duplicate Finder",
        description="Shows all duplicate files in the directory.",
    )
    parser.add_argument("-v", "--verbose", default=False, action="store_true")
    parser.add_argument(
        "-d", "--directory", default="/home/jet/HDD_sdb/files/jet/1. Photos & Videos/"
    )
    args = parser.parse_args()

    # whether to print a lot of junk
    path_start_idx = 0 if args.verbose else len(os.getcwd())

    # check the directory
    directory = os.path.abspath(args.directory)
    assert os.path.isdir(directory), f"{directory} is not a valid directory."

    # start the program
    print("---------------------------------------------------\n")
    print("Starting duplicate finder...\n")
    if args.verbose:
        print(f"Target directory: {directory}\n")
    print("---------------------------------------------------\n")
    print("The way this program works is by comparing all filenames against each other.\n")
    print("If it finds two files with identical filenames, it first compares their sizes.\n")
    print("If the file sizes of two files are the same, it then compares their MD5 checksums.\n")
    print("It will never perform an MD5 checksum comparison on files with two different names.\n")
    print("Therefore, identical files with two different names will never get flagged.\n")
    print("---------------------------------------------------\n")
    # some runtime variables
    num_files = 0
    start_time = time.time()
    known_fullpaths: dict[FILENAME, FULLPATH] = dict()
    known_filehashes: dict[FILENAME, FILEHASH] = dict()
    duplicates: list[tuple[FULLPATH, FULLPATH]] = []
    duplicate_bytes = 0

    # start running through every possible file
    for path in glob.iglob(f"{directory}/**", recursive=True):
        num_files += 1

        # ignore directories
        if os.path.isdir(path):
            continue

        # get the filename and fullpath
        filename = os.path.basename(path)
        fullpath = os.path.abspath(path)

        # if the filename is unique, add it to our list of known files
        if filename not in known_fullpaths:
            known_fullpaths[filename] = fullpath
            continue

        # if same filename, check the filesize
        if os.stat(known_fullpaths[filename]).st_size != os.stat(fullpath).st_size:
            continue

        # if same filename, same filesize, check the hash
        if filename not in known_filehashes:
            known_filehashes[filename] = hash_file(known_fullpaths[filename])
        if known_filehashes[filename] != hash_file(fullpath):
            continue

        # if we reach here:
        # - the filename exists somewhere else
        # - both files have the same size
        # - the MD5 hash is identical to that other file's
        # therefore, it's a duplicate
        duplicates.append((known_fullpaths[filename], fullpath))
        duplicate_bytes += os.stat(fullpath).st_size
        if args.verbose:
            print(f"Dupe found for {known_fullpaths[filename]}.")
            print("---------------")
    # write the results to a file
    if len(duplicates) > 0:
        print()
        print(f"Looked at {num_files} files in {(time.time() - start_time):.3f} seconds.")
        print(f"Found {len(duplicates)} duplicate{'s' if len(duplicates) != 1 else ''}.")
        print(f"Total disk space of duplicates: {duplicate_bytes / 1024 / 1024 / 1024:.9f} Gigabytes.")
        print("Writing duplicates to `duplicates.txt`...")

        max_string_len = max(len(o[path_start_idx:]) for o, _ in duplicates)
        writefile = os.path.join(os.getcwd(), "duplicates.txt")
        with open(writefile, "w") as f:
            for original, duplicate in duplicates:
                # save the path string but only selectively include the start
                f_original = original[path_start_idx:]
                f_duplicate = duplicate[path_start_idx:]

                # dynamic spacing so the duplicate column lines up
                f.write(f_original)
                f.write(" " * (max_string_len - len(f_original) + 4))
                f.write(f_duplicate)
                f.write("\n")
        print("Done!")
    else:
        print("\nNo duplicates found!\n")