@jjshoots
Last active September 5, 2024 03:44
Python Duplicate File Finder
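To run the gist, the two flags defined in the script are -d/--directory (the directory to scan recursively) and -v/--verbose (print extra detail while scanning). Assuming the file is saved as duplicate_finder.py (the filename is not given in the gist), an invocation might look like python duplicate_finder.py -d /path/to/photos -v. Any duplicates found are written to duplicates.txt in the current working directory.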
import time
import argparse
import glob
import hashlib
import os

# Type aliases
FULLPATH = str
FILENAME = str
FILEHASH = str


def hash_file(filename: FULLPATH) -> FILEHASH:
    """Returns the MD5 hash of the file."""
    with open(filename, "rb") as f:
        return hashlib.md5(f.read()).hexdigest()
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        prog="Duplicate Finder",
        description="Shows all duplicate files in the directory.",
    )
    parser.add_argument("-v", "--verbose", default=False, action="store_true")
    parser.add_argument(
        "-d", "--directory", default="/home/jet/HDD_sdb/files/jet/1. Photos & Videos/"
    )
    args = parser.parse_args()

    # whether to print a lot of junk
    path_start_idx = 0 if args.verbose else len(os.getcwd())

    # check the directory
    directory = os.path.abspath(args.directory)
    assert os.path.isdir(directory), f"{directory} is not a valid directory."

    # start the program
    print("---------------------------------------------------\n")
    print("Starting duplicate finder...\n")
    if args.verbose:
        print(f"Target directory: {directory}\n")
    print("---------------------------------------------------\n")
    print("The way this program works is by comparing all filenames against each other.\n")
    print("If it finds two files with identical filenames, it first compares their sizes.\n")
    print("If the file sizes of two files are the same, it then compares their MD5 checksums.\n")
    print("It will never perform an MD5 checksum comparison on files with two different names.\n")
    print("Therefore, identical files with two different names will never get flagged.\n")
    print("---------------------------------------------------\n")
    # some runtime variables
    num_files = 0
    start_time = time.time()
    known_fullpaths: dict[FILENAME, FULLPATH] = dict()
    known_filehashes: dict[FILENAME, FILEHASH] = dict()
    duplicates: list[tuple[FULLPATH, FULLPATH]] = []
    duplicate_bytes = 0

    # start running through every possible file
    for path in glob.iglob(f"{directory}/**", recursive=True):
        num_files += 1

        # ignore directories
        if os.path.isdir(path):
            continue

        # get the filename and fullpath
        filename = os.path.basename(path)
        fullpath = os.path.abspath(path)

        # if the filename is unique, add it to our list of known files
        if filename not in known_fullpaths:
            known_fullpaths[filename] = fullpath
            continue

        # if same filename, check the filesize
        if os.stat(known_fullpaths[filename]).st_size != os.stat(fullpath).st_size:
            continue

        # if same filename, same filesize, check the hash
        if filename not in known_filehashes:
            known_filehashes[filename] = hash_file(known_fullpaths[filename])
        if known_filehashes[filename] != hash_file(fullpath):
            continue

        # if we reach here:
        # - the filename exists somewhere else
        # - both files have the same size
        # - the MD5 hash is identical to that other file's
        # therefore, it's a duplicate
        duplicates.append((known_fullpaths[filename], fullpath))
        duplicate_bytes += os.stat(fullpath).st_size
        if args.verbose:
            print(f"Dupe found for {known_fullpaths[filename]}.")
            print("---------------")
    # write the results to a file
    if len(duplicates) > 0:
        print()
        print(f"Looked at {num_files} files in {(time.time() - start_time):.3f} seconds.")
        print(f"Found {len(duplicates)} duplicate{'s' if len(duplicates) != 1 else ''}.")
        print(f"Total disk space of duplicates: {duplicate_bytes / 1024 / 1024 / 1024:.9f} Gigabytes.")
        print("Writing duplicates to `duplicates.txt`...")

        max_string_len = max(len(o[path_start_idx:]) for o, _ in duplicates)
        writefile = os.path.join(os.getcwd(), "duplicates.txt")
        with open(writefile, "w") as f:
            for original, duplicate in duplicates:
                # save the path string but only selectively include the start
                f_original = original[path_start_idx:]
                f_duplicate = duplicate[path_start_idx:]

                # dynamic spacing so the duplicate column lines up
                f.write(f_original)
                f.write(" " * (max_string_len - len(f_original) + 4))
                f.write(f_duplicate)
                f.write("\n")
        print("Done!")
    else:
        print("\nNo duplicates found!\n")