diff options
author | zlg <zlg@zlg.space> | 2018-06-25 22:35:22 -0700 |
---|---|---|
committer | zlg <zlg@zlg.space> | 2018-06-25 22:35:22 -0700 |
commit | 1718a215035ced44e0acd75c1296e79d2b59e4df (patch) | |
tree | 2272eecf8ca91752e07fb67d18fedadec6a561be | |
parent | Reorganize project (diff) | |
download | dupekill-1718a215035ced44e0acd75c1296e79d2b59e4df.tar.gz dupekill-1718a215035ced44e0acd75c1296e79d2b59e4df.tar.bz2 dupekill-1718a215035ced44e0acd75c1296e79d2b59e4df.tar.xz dupekill-1718a215035ced44e0acd75c1296e79d2b59e4df.zip |
Skip over block and character device nodes, sockets, and FIFOs
The verbose and all_files options now also print the file that a dupe or clash was matched against.
A "clash" happens when a symlink has already been added to the list of scanned files and a
later file with the same content (or the file the link points to) registers as a dupe. The symlink is
then removed from the list and from the filesystem (assuming -d and -i aren't specified).
Diffstat (limited to '')
-rw-r--r-- | TODO | 2 | ||||
-rwxr-xr-x | dupekill | 77 |
2 files changed, 66 insertions, 13 deletions
@@ -1 +1 @@ -* Find a way to indicate which file a dupe is a copy of. + @@ -6,7 +6,7 @@ import stat from optparse import OptionParser # dupekill - deletes duplicates of existing data -# Version 1.4 (2012-04-02) +# Version 1.6 (2012-04-02) # Written by zlg <zlg@zlg.space> # Original idea and code by NF <radicalmori@gmail.com> # License: WTFPL <http://sam.zoy.org/wtfpl> @@ -58,10 +58,25 @@ def dupekill(dry_run=False, recursive=False, verbose=False, all_files=False, ign # Set up a few variables we'll be needing. filepath = os.path.join(root, item) + # We need this to determine the file's type + filemode = os.lstat(filepath).st_mode + + # Skip past file types we shouldn't deal with: sockets, device nodes, etc + if stat.S_ISSOCK(filemode) or stat.S_ISCHR(filemode) or stat.S_ISBLK(filemode) or stat.S_ISFIFO(filemode): + continue + + # Make sure links are handled properly. If they're gonna be ignored, + # there's no reason to continue the loop. + if stat.S_ISLNK(filemode): + if not ignore_links: + if all_files: + print("LINK:", filepath) + continue + # Check for information. If we can't fetch any, there's likely # a dead symlink or something else wrong. try: - filesize = os.lstat(filepath).st_size + filesize = os.path.getsize(filepath) # This occurs when the data points to something that can't be found or # resolved. except IOError: @@ -75,11 +90,19 @@ def dupekill(dry_run=False, recursive=False, verbose=False, all_files=False, ign print("DEAD LINK:", filepath) continue - if ignore_links: - if os.lstat(filepath).st_mode == 41471: - continue + # No sense in adding an empty file to the list + if filesize is 0: + continue + + # We need this flag to determine state before adding to the list + deleted = False - deleted = False # We need this flag to determine state before adding to the list + # A clash happens when a symlink has been added to the list and is being + # checked against a file. 
In this case, the symlink should be deleted + # since the file contains the actual data. + clash = False + + # Check to see if we can read or write to the file. if not os.access(filepath, os.R_OK): if verbose or all_files: print("NOTICE: Cannot read from", filepath) @@ -116,17 +139,32 @@ def dupekill(dry_run=False, recursive=False, verbose=False, all_files=False, ign # ---- CHECK FILE PREVIEW ---- if len(entry) == 3 and entry[2] == filepreview: - # If they match and the files are less than 512 bytes... we don't need to hash! - if entry[1] < 512 and filesize < 512: + # If they match and the files are less than 513 bytes... we don't need to hash! + if entry[1] < 513 and filesize < 513: # We KNOW there's a match now. deleted_files += 1 deleted = True + # Identify the right file to get rid of + if stat.S_ISLNK(os.lstat(entry[0]).st_mode): + clash = True + + # It's important to make sure the proper file is being deleted if not dry_run: - os.remove(filepath) + # If the hashlist has a symlink, get rid of it + if not ignore_links and clash: + os.remove(entry[0]) + else: + os.remove(filepath) if verbose or all_files: - print("DUPE:", filepath) + if not ignore_links and clash: + print("CLASH:", entry[0]) + print(" with", filepath) + del entry + else: + print("DUPE:", filepath) + print(" of", entry[0]) break else: # Open the file in binary mode, to avoid UTF-8 errors. 
@@ -145,11 +183,26 @@ def dupekill(dry_run=False, recursive=False, verbose=False, all_files=False, ign deleted_files += 1 deleted = True + # Identify the right file to get rid of + if stat.S_ISLNK(os.lstat(entry[0]).st_mode): + clash = True + + # It's important to make sure the proper file is being deleted if not dry_run: - os.remove(filepath) + # If the hashlist has a symlink, get rid of it + if not ignore_links and clash: + os.remove(entry[0]) + else: + os.remove(filepath) if verbose or all_files: - print("DUPE:", filepath) + if not ignore_links and clash: + print("CLASH:", entry[0]) + print(" with", filepath) + del entry + else: + print("DUPE:", filepath) + print(" of", entry[0]) break if need_to_add(filepath, hashList) and not deleted: |