Take command-line option for canonical prefix, any number of dirs.
catseye
11 years ago
2 | 2 | """ |
3 | 3 | Find duplicate files. |
4 | 4 | |
5 | Usage: find-dups dir [canonical] | |
5 | Usage: find-dups [--canonical=prefix] {dir} | |
6 | 6 | |
7 | If a duplicate is found, and one of the copies is has the prefix | |
7 | If a duplicate is found, and exactly one of the copies has the prefix | |
8 | 8 | given by `canonical`, commands to remove all the other copies are output. |
9 | 9 | |
10 | 10 | """ |
12 | 12 | import hashlib |
13 | 13 | import os |
14 | 14 | import sys |
15 | from optparse import OptionParser | |
15 | 16 | |
16 | 17 | |
17 | 18 | def md5(filename): |
35 | 36 | |
36 | 37 | |
37 | 38 | def main(argv): |
39 | parser = OptionParser() | |
40 | parser.add_option("--canonical", | |
41 | dest="canonical", | |
42 | default=None, | |
43 | help="output commands to delete all duplicates " | |
44 | "that do not have this prefix") | |
45 | (options, args) = parser.parse_args() | |
46 | | |
38 | 47 | hashmap = {} |
39 | for root, dirs, files in os.walk(argv[1]): | |
40 | for filename in files: | |
41 | full = os.path.normpath(os.path.join(root, filename)) | |
42 | hash = md5(full) | |
43 | hashmap.setdefault(hash, []).append(full) | |
48 | for directory in args: | |
49 | for root, dirs, files in os.walk(directory): | |
50 | for filename in files: | |
51 | full = os.path.normpath(os.path.join(root, filename)) | |
52 | hash = md5(full) | |
53 | hashmap.setdefault(hash, []).append(full) | |
44 | 54 | |
45 | 55 | for hash in hashmap: |
46 | 56 | filenames = sorted(hashmap[hash]) |
49 | 59 | for filename in filenames: |
50 | 60 | print "# %s" % filename |
51 | 61 | |
52 | if len(argv) == 3: | |
53 | canonical = argv[2] | |
62 | if options.canonical is not None: | |
54 | 63 | |
55 | 64 | for hash in hashmap: |
56 | 65 | filenames = sorted(hashmap[hash]) |
57 | 66 | if len(filenames) > 1: |
58 | 67 | canonicals = [] |
59 | 68 | for filename in filenames: |
60 | if filename.startswith(canonical): | |
69 | if filename.startswith(options.canonical): | |
61 | 70 | canonicals.append(filename) |
62 | 71 | if len(canonicals) == 1: |
63 | 72 | print "# delete all except %s" % canonicals[0] |
64 | 73 | for filename in filenames: |
65 | if not filename.startswith(canonical): | |
66 | print "rm '%s'" % filename | |
74 | if not filename.startswith(options.canonical): | |
75 | print 'rm "%s"' % filename | |
67 | 76 | |
68 | 77 | |
69 | 78 | if __name__ == '__main__': |