Python Duplicate File Finder
import time
import argparse
import glob
import hashlib
import os

# Type aliases
FULLPATH = str
FILENAME = str
FILEHASH = str


def hash_file(filename: FULLPATH) -> FILEHASH:
    """Returns the MD5 hash of the file."""
    with open(filename, "rb") as f:
        return hashlib.md5(f.read()).hexdigest()

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        prog="Duplicate Finder",
        description="Shows all duplicate files in the directory.",
    )
    parser.add_argument("-v", "--verbose", default=False, action="store_true")
    parser.add_argument(
        "-d", "--directory", default="/home/jet/HDD_sdb/files/jet/1. Photos & Videos/"
    )
    args = parser.parse_args()

    # in non-verbose mode, strip the current working directory prefix from reported paths
    path_start_idx = 0 if args.verbose else len(os.getcwd())

    # check that the target directory exists
    directory = os.path.abspath(args.directory)
    assert os.path.isdir(directory), f"{directory} is not a valid directory."

    # start the program
    print("---------------------------------------------------\n")
    print("Starting duplicate finder...\n")
    if args.verbose:
        print(f"Target directory: {directory}\n")
    print("---------------------------------------------------\n")
    print("The way this program works is by comparing all filenames against each other.\n")
    print("If it finds two files with identical filenames, it first compares their sizes.\n")
    print("If the filesizes of two files are the same, it then compares their MD5 checksums.\n")
    print("It will never perform an MD5 checksum comparison on files with two different names.\n")
    print("Therefore, identical files with two different names will never get flagged.\n")
    print("---------------------------------------------------\n")
    # some runtime variables
    num_files = 0
    start_time = time.time()
    known_fullpaths: dict[FILENAME, FULLPATH] = dict()
    known_filehashes: dict[FILENAME, FILEHASH] = dict()
    duplicates: list[tuple[FULLPATH, FULLPATH]] = []
    duplicate_bytes = 0

    # start running through every possible file
    for path in glob.iglob(f"{directory}/**", recursive=True):
        num_files += 1

        # ignore directories
        if os.path.isdir(path):
            continue

        # get the filename and fullpath
        filename = os.path.basename(path)
        fullpath = os.path.abspath(path)

        # if the filename is unique, add it to our list of known files
        if filename not in known_fullpaths:
            known_fullpaths[filename] = fullpath
            continue

        # if same filename, check the filesize
        if os.stat(known_fullpaths[filename]).st_size != os.stat(fullpath).st_size:
            continue

        # if same filename, same filesize, check the hash
        if filename not in known_filehashes:
            known_filehashes[filename] = hash_file(known_fullpaths[filename])
        if known_filehashes[filename] != hash_file(fullpath):
            continue

        # if we reach here, the file:
        # - shares its filename with a previously seen file
        # - has the same size as that file
        # - has the same MD5 hash as that file
        # therefore, it's a duplicate
        duplicates.append((known_fullpaths[filename], fullpath))
        duplicate_bytes += os.stat(fullpath).st_size
        if args.verbose:
            print(f"Dupe found for {known_fullpaths[filename]}.")
            print("---------------")
    # write the results
    if len(duplicates) > 0:
        print()
        print(f"Looked at {num_files} files in {(time.time() - start_time):.3f} seconds.")
        print(f"Found {len(duplicates)} duplicate{'s' if len(duplicates) != 1 else ''}.")
        print(f"Total disk space of duplicates: {duplicate_bytes / 1024 / 1024 / 1024:.9f} Gigabytes.")
        print("Writing duplicates to `duplicates.txt`...")

        max_string_len = max(len(o[path_start_idx:]) for o, _ in duplicates)

        writefile = os.path.join(os.getcwd(), "duplicates.txt")
        with open(writefile, "w") as f:
            for original, duplicate in duplicates:
                # save the path string but only selectively include the start
                f_original = original[path_start_idx:]
                f_duplicate = duplicate[path_start_idx:]

                # dynamic spacing: pad the first column so the second column lines up
                f.write(f_original)
                f.write(" " * (max_string_len - len(f_original) + 4))
                f.write(f_duplicate)
                f.write("\n")
        print("Done!")
    else:
        print("\nNo duplicates found!\n")