#!/usr/bin/env python
"""
Find unique files.
Usage: find-unique src-dir {dest-dir}
List all files in src-dir which are not to be found anywhere (under any name)
in any of the dest-dirs.
src-dir may be somewhere inside one of the dest-dirs. src-dir will not be
traversed during dest-dir traversal. (really? doesn't always seem to work)
"""
import hashlib
import os
import re
import sys
from optparse import OptionParser
def md5(filename):
    """Compute and return the MD5 hash of the named file.

    The file is opened in binary mode so the digest covers the exact bytes
    stored on disk; text mode would apply newline translation on some
    platforms and hands ``str`` (not ``bytes``) to the hasher on Python 3.

    filename -- path of the file to hash.

    Returns the digest as a hexadecimal string.
    Raises IOError/OSError if the file cannot be opened or read.
    """
    digest = hashlib.md5()
    # The with-block guarantees the handle is closed even if a read fails.
    with open(filename, "rb") as stream:
        # Read in fixed-size chunks so large files never have to fit in
        # memory; read() returns b"" at end of file, which ends the loop.
        for chunk in iter(lambda: stream.read(65536), b""):
            digest.update(chunk)
    return digest.hexdigest()
def build_sizemap(directory, sizemap, exclude=None):
    """Record every non-symlink file under `directory` in `sizemap` by size.

    directory -- root of the tree to walk.
    sizemap   -- dict mapping file size (bytes) to a list of normalized file
                 paths; it is updated in place, so several trees can be
                 accumulated into the same map.
    exclude   -- optional directory whose whole subtree is skipped when it is
                 encountered during the walk.

    Symbolic links are ignored.  A file whose size cannot be read has the
    error printed and is skipped; the walk continues.
    """
    # Compare canonical paths: the caller may spell the excluded directory
    # differently from how os.walk reports it (relative vs. absolute,
    # un-normalized, or reached through a symlink), and a plain string
    # comparison would then silently fail to skip it.
    excluded = None
    if exclude is not None:
        excluded = os.path.normcase(os.path.realpath(exclude))

    for root, dirs, files in os.walk(directory):
        if (excluded is not None
                and os.path.normcase(os.path.realpath(root)) == excluded):
            print("(skipping %s)" % exclude)
            # Emptying dirs in place stops os.walk from descending further.
            dirs[:] = []
            continue
        for filename in files:
            full = os.path.normpath(os.path.join(root, filename))
            if os.path.islink(full):
                continue
            try:
                size = os.path.getsize(full)
            except (IOError, OSError) as e:
                # getsize reports failures as OSError, which is not an
                # IOError on Python 2; catch both so one vanished or
                # unreadable file does not abort the whole traversal.
                print(str(e))
                continue
            sizemap.setdefault(size, []).append(full)
### MAIN ###
def main(argv):
    """Command-line entry point: print files that exist only in src-dir.

    argv -- full argument vector with the program name first (sys.argv
            style).  After options, the first positional argument is the
            source directory and the remaining ones are destination
            directories to compare against.

    Every file under src-dir whose content is not found in any dest-dir is
    printed; with --move-to it is also renamed into that directory.  Files
    are first grouped by size, and MD5 digests are computed only for sizes
    that occur on both sides.
    """
    parser = OptionParser(usage="usage: %prog [options] src-dir {dest-dir}")
    parser.add_option("--move-to",
                      dest="move_to",
                      default=None,
                      help="move unique files to this directory "
                           "(DOES NOT CHECK THAT THE FILE BASENAMES "
                           "ARE UNIQUE)")
    # Parse the vector we were handed rather than implicitly re-reading
    # sys.argv, so main() honours its argument.
    (options, args) = parser.parse_args(argv[1:])
    if not args:
        # Prints the usage message and exits instead of an IndexError.
        parser.error("missing src-dir argument")

    dir_a = os.path.normpath(args[0])
    print("traversing %s..." % dir_a)
    a = {}
    build_sizemap(dir_a, a)

    b = {}
    for arg in args[1:]:
        dest = os.path.normpath(arg)
        if os.path.isdir(dest):
            print("traversing %s..." % dest)
            # src-dir may live inside a dest-dir; do not count it there.
            build_sizemap(dest, b, exclude=dir_a)

    def process(filename):
        """Report one unique file and, if requested, move it away."""
        print(filename)
        if options.move_to is not None:
            basename = os.path.basename(filename)
            destname = os.path.join(options.move_to, basename)
            # Keep appending '1' until the target name is free.
            while os.path.exists(destname):
                destname += '1'
            print("renaming to %s..." % destname)
            os.rename(filename, destname)

    for size in a:
        if size not in b:
            # No destination file has this size, so none can be identical.
            for filename in a[size]:
                process(filename)
        else:
            a_hash = {}
            for filename in a[size]:
                a_hash.setdefault(md5(filename), []).append(filename)
            # Only membership is needed on the destination side.
            b_digests = set(md5(filename) for filename in b[size])
            for digest in a_hash:
                if digest not in b_digests:
                    for filename in a_hash[digest]:
                        process(filename)
# Run the tool only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main(sys.argv)