#!/usr/bin/env python3
import os
import hashlib
import sys
from optparse import OptionParser

# dupekill - deletes duplicates of existing data
# version 1.2
# written by zlg
#        and NF
#
# licensed under the...
#
#            DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
#                    Version 2, December 2004
#
# Copyright (C) 2004 Sam Hocevar
#  14 rue de Plaisance, 75014 Paris, France
# Everyone is permitted to copy and distribute verbatim or modified
# copies of this license document, and changing it is allowed as long
# as the name is changed.
#
#            DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
#   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
#
#  0. You just DO WHAT THE FUCK YOU WANT TO.
#
# This program is free software. It comes without any warranty, to
# the extent permitted by applicable law.
#
# You have been warned. >:3


def dupekill():
    usage = "Usage: %prog [options] {path}"
    parser = OptionParser(usage=usage)
    parser.add_option("-d", "--dry", dest='dry_run', action='store_true',
                      default=False,
                      help="display the files dupekill would delete, "
                           "without deleting anything")
    parser.add_option("-r", "--recursive", dest='recursive',
                      action='store_true', default=False,
                      help="recurse into all directories below the "
                           "starting point")
    parser.add_option("-v", "--verbose", dest='verbose',
                      action='store_true', default=False,
                      help="provide more detailed output")

    (options, args) = parser.parse_args()

    # Use the supplied path if it is a directory, otherwise fall back
    # to the current working directory.
    if args and os.path.isdir(args[0]):
        path = os.path.abspath(args[0])
    else:
        path = os.getcwd()

    if not os.path.isdir(path):
        print("Error: unable to fetch a directory to work with.")
        sys.exit(1)

    # Create the directory walker, the set of seen hashes and the counters.
    file_list = os.walk(path)
    seen_hashes = set()
    processed_files = 0
    deleted_files = 0

    ignore_dirs = ['.git', '.config']

    for root, dirs, files in file_list:
        if options.recursive:
            # Recursion still needs to skip certain directories.
            for ignored in ignore_dirs:
                if ignored in dirs:
                    dirs.remove(ignored)
        else:
            # Without recursion we don't descend into _any_ directory.
            del dirs[:]

        for item in files:
            file_path = os.path.join(root, item)

            # Hash the file's contents; files with the same SHA-256
            # digest are treated as duplicates.
            with open(file_path, "rb") as checked_file:
                digest = hashlib.sha256(checked_file.read()).hexdigest()

            processed_files += 1

            if digest in seen_hashes:
                # We want to count these, even if it's a dry run.
                deleted_files += 1

                if not options.dry_run:
                    os.remove(file_path)

                if options.verbose:
                    print("Dupe", file_path, "found.")
            else:
                seen_hashes.add(digest)

                if options.verbose:
                    print("New file", file_path)

    # Print a summary
    print()

    if options.dry_run:
        print("THIS IS A DRY RUN! NO FILES WILL BE ALTERED!")

    print(processed_files, "files processed,", deleted_files, "deleted.\n")


if __name__ == '__main__':
    try:
        dupekill()
    except KeyboardInterrupt:
        print("Aborted")