#!/usr/bin/env python
r"""
Find duplicate files.
Usage: find-dups [--canonical=<prefix>] [--format=<fmt>] {dir}
Any number of directories may be given.
If --canonical is not given, list all duplicate files found in those
directories, indexed by their md5 hashes.
If --canonical is given, it is a prefix; list all duplicate files
that do not have that prefix.
If --format is given, apply that format string to output filenames.
For example,
find-dups foo --canonical=foo/bar --format="rm \"%s\""
...will produce a script to remove the duplicate files found somewhere
in foo other than in foo/bar.
"""
import hashlib
import os
import sys
from optparse import OptionParser
def md5(filename, chunk_size=65536):
    """Compute and return the MD5 hash of the named file.

    The file is read in binary mode so the digest reflects the exact bytes
    on disk (text mode would translate line endings on some platforms and
    cannot be fed to hashlib on Python 3).

    Args:
        filename: Path of the file to hash.
        chunk_size: Number of bytes to read per iteration; the file is
            streamed so memory use does not depend on the file size.

    Returns:
        The digest as a lowercase hexadecimal string.

    Raises:
        IOError / OSError: If the file cannot be opened or read.
    """
    digest = hashlib.md5()
    # The with-block guarantees the handle is closed even if a read fails.
    with open(filename, "rb") as stream:
        # read() returns an empty bytes object at end of file, which ends
        # the iteration.
        for chunk in iter(lambda: stream.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()
def build_sizemap(directory, sizemap, exclude=None):
    """Group the files found under a directory tree by their size.

    Walks ``directory`` with ``os.walk`` and, for every file that is not a
    symbolic link, appends its normalised path to ``sizemap[size]``.

    Args:
        directory: Root of the tree to scan.
        sizemap: Dict mapping a size in bytes to a list of file paths. It is
            updated in place so several trees can share one map.
        exclude: Optional directory path. That directory and everything
            below it is skipped. The path is normalised before comparison,
            so spellings such as ``foo/`` or ``./foo`` match.

    Returns:
        None. Results are accumulated in ``sizemap``.

    Diagnostics (skipped directory, files whose size cannot be read) are
    written to stderr so that stdout carries only the program's results.
    """
    # Walked roots are normalised below, so normalise the exclusion once to
    # compare like with like. An empty/None exclude disables the check.
    excluded = os.path.normpath(exclude) if exclude else None
    for root, dirs, files in os.walk(directory):
        if excluded is not None and os.path.normpath(root) == excluded:
            sys.stderr.write("(skipping %s)\n" % exclude)
            # Emptying the list in place stops os.walk from descending.
            dirs[:] = []
            continue
        for filename in files:
            full = os.path.normpath(os.path.join(root, filename))
            if os.path.islink(full):
                continue
            try:
                size = os.path.getsize(full)
            except (IOError, OSError) as e:
                # os.path.getsize raises OSError (e.g. the file vanished or
                # is unreadable); report it and keep scanning.
                sys.stderr.write("%s\n" % e)
                continue
            sizemap.setdefault(size, []).append(full)
### MAIN ###
def main(argv):
    """Command-line entry point: report duplicate files.

    Files are first grouped by size; only files that share a size with at
    least one other file are hashed with MD5, and files with equal digests
    are treated as duplicates.

    Args:
        argv: Full argument vector with the program name first (as in
            ``sys.argv``). Options are ``--canonical=<prefix>`` and
            ``--format=<fmt>``; the remaining arguments are the directories
            to scan. ``None`` falls back to ``sys.argv``.

    Output on stdout:
        Without ``--canonical``: one tab-separated "digest, path" line for
        every file that has at least one duplicate.
        With ``--canonical``: a blank line, then, for every duplicate group
        containing exactly one path that starts with the prefix, each other
        path in the group rendered through the ``--format`` string.

    Progress and error messages go to stderr so that stdout can be used
    directly as a script when ``--format`` is given.
    """
    parser = OptionParser()
    parser.add_option("--canonical",
                      dest="canonical",
                      default=None,
                      help="list all duplicate files found "
                           "that do not have this prefix")
    parser.add_option("--format",
                      dest="format",
                      default="%s",
                      help="format string for output filenames")
    # Parse the vector we were given (minus the program name) instead of
    # silently re-reading sys.argv; parse_args(None) uses sys.argv[1:].
    (options, args) = parser.parse_args(
        argv[1:] if argv is not None else None)

    sizemap = {}
    for directory in args:
        sys.stderr.write("traversing %s...\n" % directory)
        build_sizemap(directory, sizemap)

    # Only files sharing a size with another file can be duplicates, so
    # only those are worth hashing.
    hashmap = {}
    for filenames in sizemap.values():
        if len(filenames) < 2:
            continue
        for filename in filenames:
            try:
                digest = md5(filename)
            except (IOError, OSError) as e:
                # An unreadable file should not abort the whole run.
                sys.stderr.write("%s\n" % e)
                continue
            hashmap.setdefault(digest, []).append(filename)

    # Digests are visited in sorted order so the output is reproducible.
    if options.canonical is None:
        for digest in sorted(hashmap):
            filenames = sorted(hashmap[digest])
            if len(filenames) > 1:
                for filename in filenames:
                    print("%s\t%s" % (digest, filename))
    else:
        print("")
        for digest in sorted(hashmap):
            filenames = sorted(hashmap[digest])
            if len(filenames) < 2:
                continue
            canonicals = [name for name in filenames
                          if name.startswith(options.canonical)]
            # Groups with zero or several canonical copies are left alone;
            # only an unambiguous single canonical copy yields output.
            if len(canonicals) == 1:
                for filename in filenames:
                    if not filename.startswith(options.canonical):
                        print(options.format % filename)
# Run the command-line interface only when executed as a script, not when
# this module is imported.
if "__main__" == __name__:
    main(sys.argv)