duplicate-files.py - 2023-03-07

Posted: 2023-03-07
Word Count: 740
Tags: programming python python-code

Table of Contents

For an explanation, see the directory above.

Highlighting shows changes from the previous version

duplicate-files.py

Summary of Changes

Usage

usage: duplicate-files.py [-h] [-q, --quiet] [-v, --verbose]
                          [-o, --output FILE] [-j, --json] [-y, --yaml]
                          [-p, --pretty] [-z, --zero-length]
                          DIR [DIR ...]

Find duplicate files in one or more directories.

positional arguments:
  DIR                a directory to scan

options:
  -h, --help         show this help message and exit
  -q, --quiet        run without printing to standard output.
  -v, --verbose      print extra information to standard output.
  -o, --output FILE  write to the given file (default: standard output)
  -j, --json         write output as JSON
  -y, --yaml         write output as YAML (default)
  -p, --pretty       "pretty-print" the output
  -z, --zero-length  include zero-length files

Listing

  1#!/usr/bin/env python3
  2
  3from pathlib import Path
  4from itertools import combinations
  5
  6import argparse
  7import filecmp
  8import fnmatch
  9import json
 10import sys
 11
 12try:
 13    import simplejson as json
 14except ImportError:
 15    import json
 16
 17global yaml_enabled
 18
 19try:
 20    import yaml
 21except ImportError:
 22    yaml_enabled = False
 23else:
 24    yaml_enabled = True
 25
 26
 27STD_EXCLUDES = [
 28    # version control directories
 29    'CVS', 'CVSROOT', '.git', '.gitignore', '.svn',
 30    # macosx-specific hidden files
 31    '.DS_Store', '.Apple*', '._*',
 32    # trash directories
 33    'Trash', '.Trash*',
 34]
 35
 36def prunable(name):
 37    return any(fnmatch.fnmatch(name, pat) for pat in STD_EXCLUDES)
 38
 39def find_files(sizemap, name):
 40    if isinstance(name, Path):
 41        f = name
 42    else:
 43        f = Path(name)
 44
 45    if prunable(f.name): return
 46
 47    # TODO: Check we have permissions 
 48
 49    if f.is_symlink(): return
 50
 51    if f.is_dir():
 52        for child in f.iterdir():
 53            find_files(sizemap, child)
 54    elif f.is_file():
 55        sz = f.stat().st_size
 56        if sz in sizemap:
 57            sizemap[sz].add(f)
 58        else:
 59            sizemap[sz] = set((f,))
 60
 61def duplicate_files(file_i, file_j):
 62     return file_i != file_j \
 63        and not file_i.samefile(file_j) \
 64        and filecmp.cmp(file_i, file_j, shallow=False)
 65
 66def add_to_dupsets(dupsets, file_i, file_j):
 67    if not file_i in dupsets:
 68        dupsets[file_i] = frozenset((file_i,))
 69    if not file_j in dupsets:
 70        dupsets[file_j] = frozenset((file_j,))
 71    newset = dupsets[file_i] | dupsets[file_j]
 72    for f in newset:
 73        dupsets[f] = newset
 74
 75def compare_files(sizemap):
 76    dupsets = {}
 77    for sz, ls in sizemap.items():
 78        if sz == 0: continue
 79        for file_i, file_j in combinations(ls, 2):
 80            if duplicate_files(file_i, file_j):
 81                add_to_dupsets(dupsets, file_i, file_j)
 82
 83    superset = set(dupsets.values())
 84    result = [sorted(s) for s in superset]
 85    result.sort()
 86    return result
 87
 88def make_argparser():
 89    parser = argparse.ArgumentParser(
 90                    description='Find duplicate files in one or more directories.')
 91    parser.add_argument('dirs', 
 92                    metavar='DIR',
 93                    nargs='+',
 94                    help='a directory to scan')
 95    #parser.add_argument('-d, --from', 
 96    #                metavar='MAINDIR',
 97    #                dest='canondir',
 98    #                help='Compare other directories to this directory.')
 99    parser.add_argument('-q, --quiet',
100                    action='store_true',
101                    dest='quiet',
102                    help='run without printing to standard output.')
103    parser.add_argument('-v, --verbose',
104                    action='store_true',
105                    dest='verbose',
106                    help='print extra information to standard output.')
107    parser.add_argument('-o, --output',
108                    metavar='FILE',
109                    dest='outfile',
110                    help='write to the given file (default: standard output)')
111    if yaml_enabled: 
112        parser.add_argument('-j, --json',
113                        action='store_const',
114                        const='json',
115                        default='yaml',
116                        dest='format',
117                        help='write output as JSON')
118        parser.add_argument('-y, --yaml',
119                        action='store_const',
120                        const='yaml',
121                        default='yaml',
122                        dest='format',
123                        help='write output as YAML (default)')
124    parser.add_argument('-p, --pretty',
125                    action='store_true',
126                    dest='pretty',
127                    help='"pretty-print" the output')     
128    parser.add_argument('-z, --zero-length',
129                    action='store_true',
130                    dest='zero',
131                    help='include zero-length files')    
132    return parser
133
def convert_results(result):
    """Return a copy of *result* with every entry converted to str.

    *result* is an iterable of iterables; the return value is a list of
    lists of strings in the same order.
    """
    converted = []
    for group in result:
        converted.append(list(map(str, group)))
    return converted
136
def run():
    """Parse the command line, scan the directories and write the report.

    Every DIR argument is scanned with find_files(), the duplicate
    groups are computed with compare_files(), and the result is
    serialised as JSON (when YAML support is unavailable or JSON was
    requested) or as YAML otherwise.  The text is written to the file
    given with the output option, or printed to standard output.

    The quiet and verbose options are parsed but not acted on here.
    """
    parser = make_argparser()
    args = parser.parse_args()

    sizemap = {}
    for dirname in args.dirs:
        find_files(sizemap, dirname)

    result = compare_files(sizemap)

    if args.zero and 0 in sizemap:
        # compare_files() skips zero-length files, so they are reported
        # here as one extra group.  Both the group and `result` are
        # lists, so list operations (not set.add) are required.
        zero = [str(p) for p in sizemap[0]]
        zero.append('')  # so remove-files will remove *all* files
        # Sort like every other group; the empty string sorts first.
        # NOTE(review): presumably remove-files keeps the first entry of
        # each group -- confirm against that tool.
        zero.sort()
        result.append(zero)

    if not yaml_enabled or args.format == 'json':
        indent = 2 if args.pretty else None
        out = json.dumps(result, sort_keys=True, indent=indent,
                         cls=PathEncoder)
    else:
        out = yaml.dump(convert_results(result),
                        width=4096,
                        explicit_start=True,
                        explicit_end=args.pretty,
                        canonical=args.pretty)

    if not args.outfile:
        print(out)
    else:
        Path(args.outfile).write_text(out, encoding='utf-8')
172
class PathEncoder(json.JSONEncoder):
    """JSON encoder that serialises pathlib.Path objects as strings."""

    def default(self, obj):
        """Return str(obj) for a Path; defer to the base class otherwise."""
        if not isinstance(obj, Path):
            # The base class implementation raises the TypeError.
            return super().default(obj)
        return str(obj)
179
# Run the command-line interface only when executed as a script, not
# when the module is imported.
if __name__ == "__main__":
    run()