For an explanation, see the directory above.
Highlighting shows changes from the previous version.
duplicate-files.py
Summary of Changes
- Add command-line options comparable to the Ruby version
- Redirect output to a file.
- Convert results to YAML or “pretty-printed” JSON.
- Optionally include zero-length files in output.
- Assorted optimizations and Pythonicization.
Usage
usage: duplicate-files.py [-h] [-q, --quiet] [-v, --verbose]
[-o, --output FILE] [-j, --json] [-y, --yaml]
[-p, --pretty] [-z, --zero-length]
DIR [DIR ...]
Find duplicate files in one or more directories.
positional arguments:
DIR a directory to scan
options:
-h, --help show this help message and exit
-q, --quiet run without printing to standard output.
-v, --verbose print extra information to standard output.
-o, --output FILE write to the given file (default: standard output)
-j, --json write output as JSON
-y, --yaml write output as YAML (default)
-p, --pretty "pretty-print" the output
-z, --zero-length include zero-length files
Listing
1#!/usr/bin/env python3
2
3from pathlib import Path
4from itertools import combinations
5
6import argparse
7import filecmp
8import fnmatch
9import json
10import sys
11
12try:
13 import simplejson as json
14except ImportError:
15 import json
16
# PyYAML is an optional dependency: record whether it could be imported so
# the rest of the script can fall back to JSON output when it is missing.
# (A module-level ``global`` statement has no effect, so none is used here.)
try:
    import yaml
    yaml_enabled = True
except ImportError:
    yaml_enabled = False
26
# fnmatch-style patterns for file and directory names that the scan skips.
STD_EXCLUDES = [
    # version control directories
    'CVS',
    'CVSROOT',
    '.git',
    '.gitignore',
    '.svn',
    # macosx-specific hidden files
    '.DS_Store',
    '.Apple*',
    '._*',
    # trash directories
    'Trash',
    '.Trash*',
]
35
def prunable(name):
    """Return True if *name* matches any pattern in ``STD_EXCLUDES``.

    *name* is compared with ``fnmatch.fnmatch`` against each pattern in turn;
    the first match wins.
    """
    for pattern in STD_EXCLUDES:
        if fnmatch.fnmatch(name, pattern):
            return True
    return False
38
def find_files(sizemap, name):
    """Record every regular file at or below *name* in *sizemap*.

    *sizemap* maps a file size in bytes to the set of ``Path`` objects with
    that size; it is updated in place and nothing is returned.  *name* may be
    a ``Path`` or anything ``Path()`` accepts.  Entries whose final path
    component is ``prunable`` and symbolic links are skipped without being
    recorded or descended into.
    """
    path = name if isinstance(name, Path) else Path(name)

    # Excluded names are rejected first, then symbolic links.
    if prunable(path.name) or path.is_symlink():
        return

    # TODO: Check we have permissions

    if path.is_dir():
        for entry in path.iterdir():
            find_files(sizemap, entry)
        return

    if path.is_file():
        sizemap.setdefault(path.stat().st_size, set()).add(path)
60
def duplicate_files(file_i, file_j):
    """Return True if two distinct files have byte-identical contents.

    Equal paths, and different paths that ``samefile`` reports as the same
    underlying file, are not counted as duplicates.  Contents are compared
    with ``filecmp.cmp(..., shallow=False)``.
    """
    if file_i == file_j:
        return False
    if file_i.samefile(file_j):
        return False
    return filecmp.cmp(file_i, file_j, shallow=False)
65
def add_to_dupsets(dupsets, file_i, file_j):
    """Merge the duplicate groups of *file_i* and *file_j* in *dupsets*.

    *dupsets* maps each file to the frozenset of files known to duplicate it.
    A file that has no entry yet starts as a group containing only itself.
    After the call, every member of the merged group maps to the same merged
    frozenset.  The mapping is updated in place.
    """
    group_i = dupsets.get(file_i, frozenset([file_i]))
    group_j = dupsets.get(file_j, frozenset([file_j]))
    merged = group_i | group_j
    for member in merged:
        dupsets[member] = merged
74
def compare_files(sizemap):
    """Group files of equal size whose contents are identical.

    *sizemap* maps a size in bytes to a collection of files with that size.
    Zero-length files (key ``0``) are ignored here.  Every pair of same-size
    files is checked with ``duplicate_files`` and matches are merged with
    ``add_to_dupsets``.

    Returns a sorted list of groups, each group being a sorted list of the
    files that duplicate one another.
    """
    dupsets = {}
    for size, same_size in sizemap.items():
        if size == 0:
            continue
        for file_i, file_j in combinations(same_size, 2):
            # If both files already sit in the same group, merging them again
            # would change nothing, so skip the byte-by-byte comparison.
            if file_j in dupsets.get(file_i, ()):
                continue
            if duplicate_files(file_i, file_j):
                add_to_dupsets(dupsets, file_i, file_j)

    # Many keys share one frozenset; set() collapses them to unique groups.
    return sorted(sorted(group) for group in set(dupsets.values()))
87
def make_argparser():
    """Build the command-line parser for the duplicate-file finder.

    Each option is registered with separate short and long flags (for example
    ``'-q', '--quiet'``).  Passing a single string such as ``'-q, --quiet'``
    makes argparse treat the whole string as one option name, so the short
    flag only works through prefix abbreviation and the long flag is rejected.

    ``-j/--json`` is always accepted.  ``-y/--yaml`` is only offered when the
    module-level ``yaml_enabled`` flag is true.  ``format`` defaults to
    ``'yaml'`` when YAML is available and to ``'json'`` otherwise.

    Returns the configured ``argparse.ArgumentParser``.
    """
    parser = argparse.ArgumentParser(
        description='Find duplicate files in one or more directories.')
    parser.add_argument('dirs',
                        metavar='DIR',
                        nargs='+',
                        help='a directory to scan')
    # TODO: '-d', '--from' MAINDIR (dest='canondir'):
    #       'Compare other directories to this directory.'
    parser.add_argument('-q', '--quiet',
                        action='store_true',
                        dest='quiet',
                        help='run without printing to standard output.')
    parser.add_argument('-v', '--verbose',
                        action='store_true',
                        dest='verbose',
                        help='print extra information to standard output.')
    parser.add_argument('-o', '--output',
                        metavar='FILE',
                        dest='outfile',
                        help='write to the given file (default: standard output)')

    default_format = 'yaml' if yaml_enabled else 'json'
    parser.add_argument('-j', '--json',
                        action='store_const',
                        const='json',
                        default=default_format,
                        dest='format',
                        help='write output as JSON')
    if yaml_enabled:
        parser.add_argument('-y', '--yaml',
                            action='store_const',
                            const='yaml',
                            default=default_format,
                            dest='format',
                            help='write output as YAML (default)')
    parser.add_argument('-p', '--pretty',
                        action='store_true',
                        dest='pretty',
                        help='"pretty-print" the output')
    parser.add_argument('-z', '--zero-length',
                        action='store_true',
                        dest='zero',
                        help='include zero-length files')
    return parser
133
def convert_results(result):
    """Return a copy of *result* with every entry converted to ``str``.

    *result* is a list of groups (each an iterable of paths); the returned
    value is a new list of lists of strings in the same order.
    """
    return [list(map(str, group)) for group in result]
136
def run():
    """Scan the requested directories and report groups of duplicate files.

    Parses ``sys.argv`` with ``make_argparser``, collects files by size with
    ``find_files``, groups identical ones with ``compare_files``, and writes
    the groups as JSON or YAML to the ``--output`` file, or to standard
    output when no file is given.

    NOTE: the ``quiet`` and ``verbose`` options are parsed but not consulted
    in this function.
    """
    args = make_argparser().parse_args()

    sizemap = {}
    for dirname in args.dirs:
        find_files(sizemap, dirname)

    result = compare_files(sizemap)

    if args.zero and 0 in sizemap:
        # Both ``zero`` and ``result`` are lists, so they are built with list
        # operations; ``.add()`` would raise AttributeError.
        zero = sorted(str(p) for p in sizemap[0])
        # so remove-files will remove *all* files
        # ('' sorts before any path, so the group stays in sorted order like
        # the others -- presumably remove-files keeps the first entry; verify)
        zero.insert(0, '')
        result.append(zero)

    if not yaml_enabled or args.format == 'json':
        indent = 2 if args.pretty else None
        out = json.dumps(result, sort_keys=True, indent=indent,
                         cls=PathEncoder)
    else:
        out = yaml.dump(convert_results(result),
                        width=4096,
                        explicit_start=True,
                        explicit_end=args.pretty,
                        canonical=args.pretty)

    if args.outfile:
        Path(args.outfile).write_text(out, encoding='utf-8')
    else:
        print(out)
172
class PathEncoder(json.JSONEncoder):
    """JSON encoder that serializes ``pathlib.Path`` objects as strings."""

    def default(self, obj):
        """Return ``str(obj)`` for a ``Path``; defer to the base class otherwise."""
        if not isinstance(obj, Path):
            # The base implementation raises TypeError for unsupported types.
            return super().default(obj)
        return str(obj)
179
# Run the scan only when this file is executed as a script, not on import.
if __name__ == '__main__':
    run()