aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorzlg <zlg@zlg.space>2018-06-25 22:35:22 -0700
committerzlg <zlg@zlg.space>2018-06-25 22:35:22 -0700
commit1718a215035ced44e0acd75c1296e79d2b59e4df (patch)
tree2272eecf8ca91752e07fb67d18fedadec6a561be
parentReorganize project (diff)
downloaddupekill-1718a215035ced44e0acd75c1296e79d2b59e4df.tar.gz
dupekill-1718a215035ced44e0acd75c1296e79d2b59e4df.tar.bz2
dupekill-1718a215035ced44e0acd75c1296e79d2b59e4df.tar.xz
dupekill-1718a215035ced44e0acd75c1296e79d2b59e4df.zip
Skip over block and character device nodes, sockets, and FIFOs
The verbose and all_files modes will now output the source of the dupe or clash. A "clash" happens when a symlink has already been added to the list of scanned files and a similar file (or the file the symlink points to) then registers as a dupe. In that case the symlink will be removed from the list and from the filesystem (assuming -d and -i aren't specified).
Diffstat (limited to '')
-rw-r--r--TODO2
-rwxr-xr-xdupekill77
2 files changed, 66 insertions, 13 deletions
diff --git a/TODO b/TODO
index cbbeb7c..8b13789 100644
--- a/TODO
+++ b/TODO
@@ -1 +1 @@
-* Find a way to indicate which file a dupe is a copy of.
+
diff --git a/dupekill b/dupekill
index e8521ac..1d63b28 100755
--- a/dupekill
+++ b/dupekill
@@ -6,7 +6,7 @@ import stat
from optparse import OptionParser
# dupekill - deletes duplicates of existing data
-# Version 1.4 (2012-04-02)
+# Version 1.6 (2012-04-02)
# Written by zlg <zlg@zlg.space>
# Original idea and code by NF <radicalmori@gmail.com>
# License: WTFPL <http://sam.zoy.org/wtfpl>
@@ -58,10 +58,25 @@ def dupekill(dry_run=False, recursive=False, verbose=False, all_files=False, ign
# Set up a few variables we'll be needing.
filepath = os.path.join(root, item)
+ # We need this to determine the file's type
+ filemode = os.lstat(filepath).st_mode
+
+ # Skip past file types we shouldn't deal with: sockets, device nodes, etc
+ if stat.S_ISSOCK(filemode) or stat.S_ISCHR(filemode) or stat.S_ISBLK(filemode) or stat.S_ISFIFO(filemode):
+ continue
+
+ # Make sure links are handled properly. If they're going to be ignored,
+ # there's no reason to process this entry any further.
+ if stat.S_ISLNK(filemode):
+ if not ignore_links:
+ if all_files:
+ print("LINK:", filepath)
+ continue
+
# Check for information. If we can't fetch any, there's likely
# a dead symlink or something else wrong.
try:
- filesize = os.lstat(filepath).st_size
+ filesize = os.path.getsize(filepath)
# This occurs when the data points to something that can't be found or
# resolved.
except IOError:
@@ -75,11 +90,19 @@ def dupekill(dry_run=False, recursive=False, verbose=False, all_files=False, ign
print("DEAD LINK:", filepath)
continue
- if ignore_links:
- if os.lstat(filepath).st_mode == 41471:
- continue
+ # No sense in adding an empty file to the list
+ if filesize is 0:
+ continue
+
+ # We need this flag to determine state before adding to the list
+ deleted = False
- deleted = False # We need this flag to determine state before adding to the list
+ # A clash happens when a symlink has been added to the list and is being
+ # checked against a file. In this case, the symlink should be deleted
+ # since the file contains the actual data.
+ clash = False
+
+ # Check to see if we can read or write to the file.
if not os.access(filepath, os.R_OK):
if verbose or all_files:
print("NOTICE: Cannot read from", filepath)
@@ -116,17 +139,32 @@ def dupekill(dry_run=False, recursive=False, verbose=False, all_files=False, ign
# ---- CHECK FILE PREVIEW ----
if len(entry) == 3 and entry[2] == filepreview:
- # If they match and the files are less than 512 bytes... we don't need to hash!
- if entry[1] < 512 and filesize < 512:
+ # If they match and the files are less than 513 bytes... we don't need to hash!
+ if entry[1] < 513 and filesize < 513:
# We KNOW there's a match now.
deleted_files += 1
deleted = True
+ # Identify the right file to get rid of
+ if stat.S_ISLNK(os.lstat(entry[0]).st_mode):
+ clash = True
+
+ # It's important to make sure the proper file is being deleted
if not dry_run:
- os.remove(filepath)
+ # If the hashlist has a symlink, get rid of it
+ if not ignore_links and clash:
+ os.remove(entry[0])
+ else:
+ os.remove(filepath)
if verbose or all_files:
- print("DUPE:", filepath)
+ if not ignore_links and clash:
+ print("CLASH:", entry[0])
+ print(" with", filepath)
+ del entry
+ else:
+ print("DUPE:", filepath)
+ print(" of", entry[0])
break
else:
# Open the file in binary mode, to avoid UTF-8 errors.
@@ -145,11 +183,26 @@ def dupekill(dry_run=False, recursive=False, verbose=False, all_files=False, ign
deleted_files += 1
deleted = True
+ # Identify the right file to get rid of
+ if stat.S_ISLNK(os.lstat(entry[0]).st_mode):
+ clash = True
+
+ # It's important to make sure the proper file is being deleted
if not dry_run:
- os.remove(filepath)
+ # If the hashlist has a symlink, get rid of it
+ if not ignore_links and clash:
+ os.remove(entry[0])
+ else:
+ os.remove(filepath)
if verbose or all_files:
- print("DUPE:", filepath)
+ if not ignore_links and clash:
+ print("CLASH:", entry[0])
+ print(" with", filepath)
+ del entry
+ else:
+ print("DUPE:", filepath)
+ print(" of", entry[0])
break
if need_to_add(filepath, hashList) and not deleted: