#!/usr/bin/python
#
# Find directories with duplicate content.
#
# Usage: SCRIPT [--deep] [PARENT_DIR]
#
# Two subdirectories of PARENT_DIR are reported as duplicate candidates when
# their non-hidden regular files have exactly the same names and sizes.
# With --deep, candidate pairs are additionally compared byte-for-byte
# using the external `cmp` program.
#
# The print calls below pass a single pre-formatted string so that the
# script produces the same output under both Python 2 and Python 3.
#

import sys
import os
import stat
import itertools
import operator
import subprocess

# should we perform a "deep inspection" of the candidate directories?
DEEP_INSPECT = False


def find(parent_dir):
    """Print groups of subdirectories of *parent_dir* with matching contents.

    Each immediate, non-hidden subdirectory is summarised as the set of
    ``(filename, size)`` pairs of its non-hidden regular files.  If a
    subdirectory contains a ``VIDEO_TS`` directory, that directory is
    summarised instead.  Subdirectories whose summaries are equal are
    printed together on a ``Candidates:`` line.  When the module-level
    ``DEEP_INSPECT`` flag is true, the first directory of each group is
    also compared file-by-file against every other member.
    """
    finfo = _statall(parent_dir)
    # Sorted so that the order of names within a group is deterministic.
    subdirs = sorted(f for f, s in finfo if stat.S_ISDIR(s.st_mode))
    print('CHECKING: %d subdirectories' % len(subdirs))

    all_info = []
    for subdir in subdirs:
        path = os.path.join(parent_dir, subdir)

        # to find dup backups of DVDs
        video_ts = os.path.join(path, 'VIDEO_TS')
        # isdir rather than exists: a plain file named VIDEO_TS cannot be
        # listed and must not replace the directory being inspected.
        if os.path.isdir(video_ts):
            path = video_ts

        finfo = frozenset((f, s.st_size)
                          for f, s in _statall(path)
                          if stat.S_ISREG(s.st_mode))
        all_info.append((subdir, finfo, path))

    # Sort on the file info (names/sizes) itself.  groupby() only merges
    # *adjacent* equal keys, so the sort key must order equal sets next to
    # each other and never interleave unequal ones; sorting by hash() could
    # interleave distinct sets whose hashes collide.
    all_info.sort(key=lambda info: sorted(info[1]))

    # Locate all subdirs that have the same finfo
    for finfo, group in itertools.groupby(all_info, operator.itemgetter(1)):
        group = list(group)
        if len(group) < 2:
            continue

        candidates = [subdir for subdir, _, _ in group]
        print('Candidates: ' + ', '.join(repr(s) for s in candidates))

        if DEEP_INSPECT:
            path1 = group[0][2]
            # Sorted so files are compared in a reproducible order.
            files = sorted(f for f, _ in finfo)
            for _, _, path2 in group[1:]:
                _deep_inspect(path1, path2, files)


def _deep_inspect(path1, path2, files):
    """Compare each name in *files* between *path1* and *path2* using `cmp`.

    Progress is printed per file.  The comparison stops at the first file
    for which `cmp` exits non-zero; the match message is printed only if
    every file compared equal.
    """
    print(' INSPECTING: %s %s' % (path1, path2))
    for f in files:
        # Written without a newline so the verdict lands on the same line;
        # flushed so it appears before any output from the child process.
        sys.stdout.write(' Comparing: %s ... ' % (f,))
        sys.stdout.flush()
        rc = subprocess.call(['cmp',
                              os.path.join(path1, f),
                              os.path.join(path2, f)])
        if rc == 0:
            print('same')
        else:
            print('DIFFERENT. (rc=%d)' % (rc,))
            return
    print(' ==> DIRECTORY CONTENTS MATCH')


def _statall(dirname):
    """Return ``(name, lstat_result)`` for each non-hidden entry of *dirname*.

    Entries whose name starts with ``.`` are skipped.  ``os.lstat`` is used,
    so symbolic links are described themselves rather than their targets.
    """
    names = [f for f in os.listdir(dirname) if not f.startswith('.')]
    return [(f, os.lstat(os.path.join(dirname, f))) for f in names]


if __name__ == '__main__':
    if len(sys.argv) > 1 and sys.argv[1] == '--deep':
        DEEP_INSPECT = True
        # Drop the flag so the remaining checks only see PARENT_DIR.
        del sys.argv[1]

    if len(sys.argv) > 2:
        print('ERROR: %s [PARENT_DIR]' % (os.path.basename(sys.argv[0]),))
        sys.exit(1)

    if len(sys.argv) == 2:
        parent_dir = sys.argv[1]
    else:
        parent_dir = '.'

    find(parent_dir)