#!/usr/bin/env python3
import os
import hashlib
import sys
from optparse import OptionParser

# dupekill - deletes duplicates of existing data
# version 1.2
# written by zlg <zlg@zlg.space>
#        and NF <radicalmori@gmail.com>
#
#                     licensed under the...
#
#            DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
#                    Version 2, December 2004
#
# Copyright (C) 2004 Sam Hocevar
#  14 rue de Plaisance, 75014 Paris, France
# Everyone is permitted to copy and distribute verbatim or modified
# copies of this license document, and changing it is allowed as long
# as the name is changed.
#
#            DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
#   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
#
#  0. You just DO WHAT THE FUCK YOU WANT TO.
# 
# This program is free software. It comes without any warranty, to
# the extent permitted by applicable law.
#
# You have been warned. >:3
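
# dupekill compares files in three increasingly expensive stages: first
# by size, then by their first 512 bytes, and only as a last resort by
# a full SHA-256 hash. A later stage runs only when every earlier one
# fails to tell two files apart.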

# This function decides whether the file still needs to be added to the
# list of files we've already seen.
def need_to_add(filepath, datalist):
    # True when no entry in the list already records this path.
    return all(entry[0] != filepath for entry in datalist)

def dupekill(dry_run=False, recursive=False, verbose=False, path=None):
    # Resolve the default path at call time; a default of os.getcwd()
    # would be frozen at definition time instead.
    if path is None:
        path = os.getcwd()

    if not os.path.isdir(path):
        print("Error: Unable to fetch directory to work with.")
        sys.exit(1)

    # Create the walk generator, the list of seen files, and the counters.
    # Each hash_list entry starts as [filepath, filesize] and grows lazily
    # to hold the 512-byte preview and the SHA-256 hex digest as deeper
    # comparisons become necessary.
    file_list = os.walk(path)
    hash_list = []
    processed_files = 0
    deleted_files = 0

    for root, dirs, files in file_list:
        if recursive:
            # Recursion still needs to ignore certain dirs.
            for ignored in ('.git', '.config'):
                if ignored in dirs:
                    dirs.remove(ignored)
        else:
            # We don't need _any_ dirs if recursion's off! Emptying the
            # list in place stops os.walk from descending any further.
            dirs.clear()

        for item in files:
            # Set up a few variables we'll be needing.
            filepath = os.path.join(root, item)
            filesize = os.stat(filepath).st_size
            deleted = False # We need this flag to determine state before adding to the list

            # Note: processed_files counts every file we visit, including
            # dupes that never make it into hash_list.
            processed_files += 1

            for entry in hash_list:

                # ---- CHECK FILE NAME ----
                # We'd get a false positive if we compared a file against
                # itself.
                if filepath == entry[0]:
                    continue

                # ---- CHECK FILESIZE ----
                if filesize != entry[1]:
                    continue

                # File sizes match, so compare the first 512 bytes.
                with open(filepath, "rb") as f:
                    filepreview = f.read(512)

                # Cache the entry's preview the first time it's needed.
                if len(entry) == 2:
                    with open(entry[0], "rb") as f:
                        entry.append(f.read(512))

                # ---- CHECK FILE PREVIEW ----
                if entry[2] != filepreview:
                    # The previews differ, so the files differ.
                    continue

                # If the files fit inside the preview, the previews ARE
                # the files... we don't need to hash!
                if filesize < 512:
                    # We KNOW there's a match now.
                    deleted_files += 1
                    deleted = True

                    if not dry_run:
                        os.remove(filepath)

                    if verbose:
                        print("DUPE:", filepath)

                    # Stop comparing: this file is gone (or would be).
                    break

                # ---- CHECK FILE HASH ----
                # Read in binary mode, to avoid UTF-8 errors.
                with open(filepath, "rb") as f:
                    filehash = hashlib.sha256(f.read()).hexdigest()

                # Cache the entry's full hash the first time it's needed.
                if len(entry) == 3:
                    with open(entry[0], "rb") as f:
                        entry.append(hashlib.sha256(f.read()).hexdigest())

                if entry[3] == filehash:
                    # We KNOW there's a match now.
                    deleted_files += 1
                    deleted = True

                    if not dry_run:
                        os.remove(filepath)

                    if verbose:
                        print("DUPE:", filepath)

                    break

            if not deleted and need_to_add(filepath, hash_list):
                hash_list.append([filepath, filesize])

                if verbose:
                    print("FILE:", filepath)

    print()

    if dry_run:
        print("DRY RUN ON. NO FILES WILL BE DELETED.")

    print(processed_files, "files processed,", deleted_files,
          "would be deleted.\n" if dry_run else "deleted.\n")


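# A sketch of typical invocations (the paths here are illustrative):
#
#     ./dupekill                # dedupe the current directory
#     ./dupekill -d -r -v ~/dl  # preview a recursive run, verbosely
#
# With -d nothing is removed; dupekill only reports what it would do.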
if __name__ == '__main__':
    try:
        usage = "Usage: %prog [options] {path}"
        parser = OptionParser(usage=usage)
        parser.add_option("-d", "--dry", dest='dry_run', action='store_true',
                          default=False,
                          help="display the files dupekill would delete, "
                               "without deleting anything")
        parser.add_option("-r", "--recursive", dest='recursive',
                          action='store_true', default=False,
                          help="recurse into all directories below the "
                               "starting point")
        parser.add_option("-v", "--verbose", dest='verbose',
                          action='store_true', default=False,
                          help="provide more detailed output")
        (options, args) = parser.parse_args()

        if args:
            # Refuse a bogus path instead of silently falling back to the
            # current directory.
            if not os.path.isdir(args[0]):
                parser.error("not a directory: %s" % args[0])
            path = os.path.abspath(args[0])
        else:
            path = os.getcwd()

        dupekill(options.dry_run, options.recursive, options.verbose, path)
    except KeyboardInterrupt:
        print("Aborted")