From 1718a215035ced44e0acd75c1296e79d2b59e4df Mon Sep 17 00:00:00 2001 From: zlg Date: Mon, 25 Jun 2018 22:35:22 -0700 Subject: Skip over device and character nodes, sockets Verbose and all_files will now output the source of the dupe or clash. "Clashes" happen when a symlink is added to the list of scanned files and a similar file (or the file it points to) registers as a dupe. The symlink will be removed from the list and the filesystem (assuming -d and -i aren't specified). --- TODO | 2 +- dupekill | 77 ++++++++++++++++++++++++++++++++++++++++++++++++++++++---------- 2 files changed, 66 insertions(+), 13 deletions(-) diff --git a/TODO b/TODO index cbbeb7c..8b13789 100644 --- a/TODO +++ b/TODO @@ -1 +1 @@ -* Find a way to indicate which file a dupe is a copy of. + diff --git a/dupekill b/dupekill index e8521ac..1d63b28 100755 --- a/dupekill +++ b/dupekill @@ -6,7 +6,7 @@ import stat from optparse import OptionParser # dupekill - deletes duplicates of existing data -# Version 1.4 (2012-04-02) +# Version 1.6 (2012-04-02) # Written by zlg # Original idea and code by NF # License: WTFPL @@ -58,10 +58,25 @@ def dupekill(dry_run=False, recursive=False, verbose=False, all_files=False, ign # Set up a few variables we'll be needing. filepath = os.path.join(root, item) + # We need this to determine the file's type + filemode = os.lstat(filepath).st_mode + + # Skip past file types we shouldn't deal with: sockets, device nodes, etc + if stat.S_ISSOCK(filemode) or stat.S_ISCHR(filemode) or stat.S_ISBLK(filemode) or stat.S_ISFIFO(filemode): + continue + + # Make sure links are handled properly. If they're gonna be ignored, + # there's no reason to continue the loop. + if stat.S_ISLNK(filemode): + if not ignore_links: + if all_files: + print("LINK:", filepath) + continue + # Check for information. If we can't fetch any, there's likely # a dead symlink or something else wrong. try: - filesize = os.lstat(filepath).st_size + filesize = os.path.getsize(filepath) # This occurs when the data points to something that can't be found or # resolved. except IOError: @@ -75,11 +90,19 @@ def dupekill(dry_run=False, recursive=False, verbose=False, all_files=False, ign print("DEAD LINK:", filepath) continue - if ignore_links: - if os.lstat(filepath).st_mode == 41471: - continue + # No sense in adding an empty file to the list + if filesize is 0: + continue + + # We need this flag to determine state before adding to the list + deleted = False - deleted = False # We need this flag to determine state before adding to the list + # A clash happens when a symlink has been added to the list and is being + # checked against a file. In this case, the symlink should be deleted + # since the file contains the actual data. + clash = False + + # Check to see if we can read or write to the file. if not os.access(filepath, os.R_OK): if verbose or all_files: print("NOTICE: Cannot read from", filepath) @@ -116,17 +139,32 @@ def dupekill(dry_run=False, recursive=False, verbose=False, all_files=False, ign # ---- CHECK FILE PREVIEW ---- if len(entry) == 3 and entry[2] == filepreview: - # If they match and the files are less than 512 bytes... we don't need to hash! - if entry[1] < 512 and filesize < 512: + # If they match and the files are less than 513 bytes... we don't need to hash! + if entry[1] < 513 and filesize < 513: # We KNOW there's a match now. deleted_files += 1 deleted = True + # Identify the right file to get rid of + if stat.S_ISLNK(os.lstat(entry[0]).st_mode): + clash = True + + # It's important to make sure the proper file is being deleted if not dry_run: - os.remove(filepath) + # If the hashlist has a symlink, get rid of it + if not ignore_links and clash: + os.remove(entry[0]) + else: + os.remove(filepath) if verbose or all_files: - print("DUPE:", filepath) + if not ignore_links and clash: + print("CLASH:", entry[0]) + print(" with", filepath) + del entry + else: + print("DUPE:", filepath) + print(" of", entry[0]) break else: # Open the file in binary mode, to avoid UTF-8 errors. @@ -145,11 +183,26 @@ def dupekill(dry_run=False, recursive=False, verbose=False, all_files=False, ign deleted_files += 1 deleted = True + # Identify the right file to get rid of + if stat.S_ISLNK(os.lstat(entry[0]).st_mode): + clash = True + + # It's important to make sure the proper file is being deleted if not dry_run: - os.remove(filepath) + # If the hashlist has a symlink, get rid of it + if not ignore_links and clash: + os.remove(entry[0]) + else: + os.remove(filepath) if verbose or all_files: - print("DUPE:", filepath) + if not ignore_links and clash: + print("CLASH:", entry[0]) + print(" with", filepath) + del entry + else: + print("DUPE:", filepath) + print(" of", entry[0]) break if need_to_add(filepath, hashList) and not deleted: -- cgit v1.2.3-54-g00ecf