#!/usr/bin/env python3
# dupekill - deletes duplicates of existing data
# Version 1.6 (2012-04-02)
# Written by zlg
# Original idea and code by NF
# License: WTFPL

import os
import hashlib
import sys
import stat
from optparse import OptionParser


def need_to_add(filepath, datalist):
    """Return True when *filepath* is not yet recorded in *datalist*.

    Each datalist entry is a list whose first element is a file path.
    """
    return all(entry[0] != filepath for entry in datalist)


def dupekill(dry_run=False, recursive=False, verbose=False, all_files=False,
             ignore_links=False, path=None):
    """Scan *path* for files with duplicate contents and delete the copies.

    Candidates are compared in three increasingly expensive stages:
    file size, the first 512 bytes, and finally a SHA-256 hash of the
    whole file (only needed for files larger than 512 bytes).

    Parameters:
        dry_run      -- report what would be deleted, delete nothing
        recursive    -- descend into subdirectories (skipping .git/.config)
        verbose      -- print notices/errors as well as dupes
                        (mutually exclusive with all_files)
        all_files    -- print every file processed
        ignore_links -- skip symlinks entirely
        path         -- directory to scan; defaults to the current working
                        directory at call time

    Calls sys.exit(1) on conflicting flags or a missing directory.
    """
    if all_files and verbose:
        print("Error: All operations (-a) or only important ones (-v), not both.")
        sys.exit(1)

    # Resolve the default here instead of in the signature, so the cwd is
    # read at call time rather than frozen at import time.
    if path is None:
        path = os.getcwd()
    if not os.path.isdir(path):
        print("Error: Unable to fetch directory to work with.")
        sys.exit(1)

    # Records grow lazily as comparisons require:
    # [path, size] -> [path, size, first-512-bytes] -> [..., sha256 hexdigest]
    seen = []
    processed_files = 0
    deleted_files = 0

    def _kill_duplicate(entry, filepath):
        """Delete one member of a confirmed duplicate pair and report it.

        Normally the newcomer *filepath* is deleted (a DUPE). When the
        recorded entry is a symlink and links are being processed, the
        symlink is removed instead (a CLASH), since *filepath* holds the
        actual data; the stale record is then dropped from *seen*.

        Returns True when *filepath* was the one deleted, False when the
        recorded entry was deleted instead (so the caller knows *filepath*
        survives and should be recorded).
        """
        clash = (not ignore_links
                 and stat.S_ISLNK(os.lstat(entry[0]).st_mode))
        if not dry_run:
            os.remove(entry[0] if clash else filepath)
        if verbose or all_files:
            if clash:
                print("CLASH:", entry[0])
                print(" with", filepath)
            else:
                print("DUPE:", filepath)
                print(" of", entry[0])
        if clash and not dry_run:
            # Fix: the original `del entry` only unbound the loop variable;
            # the dead record must actually leave the list.
            seen.remove(entry)
        return not clash

    ignore_dirs = ('.git', '.config')
    for root, dirs, files in os.walk(path):
        if recursive:
            # Prune ignored directories in place so os.walk skips them.
            dirs[:] = [d for d in dirs if d not in ignore_dirs]
        else:
            # Recursion is off: don't descend anywhere.
            dirs[:] = []

        for item in files:
            filepath = os.path.join(root, item)
            # lstat (not stat) so symlinks are classified as links.
            filemode = os.lstat(filepath).st_mode

            # Skip file types we shouldn't deal with: sockets, device
            # nodes, FIFOs, etc.
            if (stat.S_ISSOCK(filemode) or stat.S_ISCHR(filemode)
                    or stat.S_ISBLK(filemode) or stat.S_ISFIFO(filemode)):
                continue

            # NOTE(review): the original collapsed source lost the exact
            # nesting here; this follows its comment — ignored links are
            # skipped outright, processed links are announced and then
            # handled like regular candidates (enabling CLASH detection).
            if stat.S_ISLNK(filemode):
                if ignore_links:
                    continue
                if all_files:
                    print("LINK:", filepath)

            try:
                filesize = os.path.getsize(filepath)
            # Fix: in Python 3 IOError *is* OSError, so the original
            # second handler ("DEAD LINK") was unreachable. Distinguish
            # the cases with FileNotFoundError instead.
            except FileNotFoundError:
                # The path vanished (or can't be resolved) between
                # listing and stat'ing.
                if verbose or all_files:
                    print("NOT FOUND:", filepath)
                continue
            except OSError:
                # This occurs mostly with dead symlinks.
                if verbose or all_files:
                    print("DEAD LINK:", filepath)
                continue

            # No sense in tracking empty files — they'd all "match".
            if filesize == 0:
                continue

            # Check access up front; unreadable files can't be compared.
            if not os.access(filepath, os.R_OK):
                if verbose or all_files:
                    print("NOTICE: Cannot read from", filepath)
                continue
            if not os.access(filepath, os.W_OK):
                if verbose or all_files:
                    print("NOTICE: Cannot write to", filepath)

            processed_files += 1
            deleted = False

            for entry in seen:
                # A file trivially matches itself; skip it.
                if filepath == entry[0]:
                    continue
                # Cheapest test first: sizes must agree.
                if filesize != entry[1]:
                    continue

                # Sizes match — compare the first 512 bytes.
                with open(filepath, "rb") as fh:
                    filepreview = fh.read(512)
                # Cache the entry's preview the first time it's needed.
                if len(entry) == 2:
                    with open(entry[0], "rb") as fh:
                        entry.append(fh.read(512))

                # Fix: the original required len(entry) == 3 here, so an
                # entry that already had its hash cached (len 4) was
                # skipped and its later duplicates were missed.
                if entry[2] != filepreview:
                    continue

                # Files of at most 512 bytes are fully covered by the
                # preview, so a preview match IS a content match — no
                # hashing needed.
                if entry[1] < 513 and filesize < 513:
                    deleted_files += 1
                    deleted = _kill_duplicate(entry, filepath)
                    break

                # Larger files: confirm with a hash of the full contents.
                with open(filepath, "rb") as fh:
                    filehash = hashlib.sha256(fh.read()).hexdigest()
                # Cache the entry's hash the first time it's needed.
                if len(entry) == 3:
                    with open(entry[0], "rb") as fh:
                        entry.append(hashlib.sha256(fh.read()).hexdigest())

                if entry[3] == filehash:
                    deleted_files += 1
                    deleted = _kill_duplicate(entry, filepath)
                    break

            # Record survivors so later files can be compared against them.
            if need_to_add(filepath, seen) and not deleted:
                seen.append([filepath, filesize])
                if all_files:
                    print("FILE:", filepath)

    print()
    if dry_run:
        print("DRY RUN ON. NO FILES WILL BE DELETED.")
    print(processed_files, "files processed,", deleted_files, "deleted.\n")


if __name__ == '__main__':
    try:
        usage = "Usage: %prog [options] {path}"
        description = "Deletes files that have duplicate data in them"
        epilog = ("dupekill likes to munch on files. A lot. By default, "
                  "symlinks and hardlinks that point to the same file will "
                  "be deleted. Be careful!")
        version = "%prog version 1.6 (2012-06-13)"
        parser = OptionParser(usage=usage, description=description,
                              epilog=epilog, version=version)
        parser.add_option("-d", "--dry", dest='dry_run',
                          action='store_true', default=False,
                          help="don't delete any files")
        parser.add_option("-r", "--recursive", dest='recursive',
                          action='store_true', default=False,
                          help="recurse into all directories below the current directory")
        parser.add_option("-v", "--verbose", dest='verbose',
                          action='store_true', default=False,
                          help="provide more detailed output")
        parser.add_option("-a", "--all-files", dest='all_files',
                          action='store_true', default=False,
                          help="show all processed files, not just dupes and errors")
        parser.add_option("-i", "--ignore-links", dest="ignore_links",
                          action='store_true', default=False,
                          help="don't process symlinks")
        (options, args) = parser.parse_args()

        if args and os.path.isdir(args[0]):
            path = os.path.abspath(args[0])
        else:
            path = os.getcwd()

        dupekill(options.dry_run, options.recursive, options.verbose,
                 options.all_files, options.ignore_links, path)
    except KeyboardInterrupt:
        print("Aborted")