For an explanation, see the directory above.
Highlighting shows changes from the previous version.
duplicate-files.rb
Summary of Changes
- Un-break pretty printing.
- Add option to remove all zero-length files.
- Silence progress bar when in `--quiet` mode.
Usage
Usage: duplicate-files.rb [options] dir1 [dir2 ...]
-q, --[no-]quiet Run absolutely quietly
-v, --[no-]verbose Run verbosely
-d, --from [DIR] Compare arguments only to files from DIR
-o, --output [OUTFILE] Write standard output to OUTFILE
-j, --json Write output as JSON
-y, --yaml Write output as YAML (default)
-p, --[no-]pretty Pretty-print output
-P, --perf-data [DIR] Write performace data to a directory (default /tmp)
-z, --[no-]zero Include zero-length files
Listing
1#!/usr/bin/env ruby
2
3require 'find'
4require 'fileutils'
5require 'optparse'
6require 'rational'
7require 'tempfile'
8require 'yaml'
9require 'json'
10
11
# Default list of file and directory names to prune in search.
# Frozen so the shared default cannot be modified by accident.
PRUNE = ['.svn', 'CVS', 'CVSROOT', '.DS_Store', '.git'].freeze

# Unfortunate artifact of transition between Ruby 1.9 and 2.0:
# ENCODING_UTF8 is the UTF-8 Encoding object when the Encoding class
# exists in this interpreter, and nil otherwise.
ENCODING_UTF8 = defined?(Encoding) ? Encoding.find('UTF-8') : nil
21
# Simple class to manage an ASCII spinner.
#
# The spinner occupies a single character on `io`; each redraw prints a
# backspace followed by the next character of SPINNER_STATES.
class Spinner
  SPINNER_STATES = ['-', '\\', '|', '/'].freeze

  # Minimum time between two redraws.
  UPDATE_INTERVAL = 0.5 # s

  # io:: stream the spinner is drawn on (defaults to $stderr)
  def initialize(io = $stderr)
    @io = io
    @state = 0
    @updated = nil
  end

  # Record the start time and draw the first spinner character.
  def start
    @updated = Time.now
    @io.print(SPINNER_STATES[0])
  end

  # Advance the spinner by one state if more than UPDATE_INTERVAL seconds
  # have passed since the last redraw. Starts the spinner first if `start`
  # was never called.
  def update
    start unless @updated

    now = Time.now
    return unless now - @updated > UPDATE_INTERVAL

    @updated = now
    @state = (@state + 1) % SPINNER_STATES.length
    @io.print("\b", SPINNER_STATES[@state])
  end
end
52
#
# Not-so-simple class to manage a progress bar
#
# Keeps an estimated amount of work (`estimate`) and the amount done so
# far (`actual`), and periodically prints "actual/estimate |bar|" to `io`
# unless `quiet` is set.
#
class Progress
  # Width of the bar in characters.
  BAR_LENGTH = 40

  # Minimum time between two redraws.
  UPDATE_INTERVAL = 1.0 # s

  attr_accessor :actual, :estimate, :io, :quiet

  # Both arguments are optional, so `Progress.new` followed by `io=` and
  # `quiet=` keeps working.
  #
  # io::    stream the bar is printed on
  # quiet:: when true, nothing is printed
  def initialize(io = $stderr, quiet = false)
    @io = io
    @updated = nil
    @estimate = 0
    @actual = 0
    @quiet = quiet
  end

  # Recompute `estimate` from groups of same-sized files.
  #
  # results:: either a Hash whose values are the groups (such as a map of
  #           files by size) or a plain list of groups
  # canon::   when true, every group is counted as one element larger
  #
  # Each group of `size` > 1 elements contributes size * (size - 1).
  def do_estimate(results, canon)
    # Iterating a Hash directly yields [key, value] pairs, whose size is
    # always 2; the groups themselves are the values.
    groups = results.respond_to?(:each_value) ? results.each_value : results.each

    @estimate = 0
    groups.each do |fileset|
      size = fileset.size
      size += 1 if canon
      @estimate += size * (size - 1) if size > 1
      # @estimate += choose(size, 2)
      # @estimate += factorial(size)
    end
    update
  end

  # n! for positive n; 1 for n <= 1.
  def factorial(n)
    n <= 1 ? 1 : n * factorial(n - 1)
  end

  # choose(n, k) is the number of ways one can choose `k` items from a
  # set of `n`. Returns 0 when n < 1, k < 0 or k > n.
  def choose(n, k)
    return 0 if n < 1 || k < 0 || k > n

    result = 1
    1.upto(k) do |i|
      result = result * (n + 1 - i) / i
    end
    result
  end

  # Add `amt` units of finished work and redraw if it is time to.
  def add(amt = 1)
    @actual += amt
    update
  end

  # Completed fraction scaled to `increments` and rounded down
  # (0 when there is no estimate).
  def progress(increments = 100)
    return 0 if @estimate == 0

    Rational(increments * @actual, @estimate).floor
  end

  # The bar itself: BAR_LENGTH characters, '#' for the completed part and
  # '-' for the remainder.
  def progress_bar
    filled = [[progress(BAR_LENGTH), 0].max, BAR_LENGTH].min
    ('#' * filled) + ('-' * (BAR_LENGTH - filled))
  end

  # Redraw the bar if more than UPDATE_INTERVAL seconds have passed since
  # the last redraw. The very first call only starts the clock.
  def update
    now = Time.now
    @updated ||= now
    return unless now - @updated > UPDATE_INTERVAL

    @updated = now
    return if quiet

    msg = @actual.to_s + "/" + @estimate.to_s + " |" + progress_bar + "|"
    # Backspace over the previous message before printing the new one.
    @io.print("\b" * msg.size, msg)
  end
end
143
# Whether `path_i` and `path_j` refer to duplicate but not identical files:
# both exist, they are not the same file, and their contents compare equal.
def duplicate_files?(path_i, path_j)
  return false unless File.exist?(path_i) && File.exist?(path_j)
  # The same file reached through two names is not a duplicate of itself.
  return false if File.identical?(path_i, path_j)

  FileUtils.cmp(path_i, path_j)
end
151
# Whether `path` should be skipped during a search: its base name is
# listed in `prune`, or it matches the glob '._*'.
def prunable?(path, prune = [])
  name = File.basename(path)
  prune.include?(name) || File.fnmatch('._*', name)
end
156
# Recurse through array of `dirs` and produce Hash of all files by size.
#
# dirs::  directories (or files) to walk
# prune:: base names to skip; see `prunable?`
# info::  when true, print a spinner and a summary to $stderr
#
# Returns a Hash mapping a size in bytes to the list of paths of that
# size. The Hash has a default block that creates an empty list for a
# missing key.
def files_by_size(dirs, prune = [], info = false)
  result = Hash.new { |h, k| h[k] = [] }
  count = 0
  spinner = nil
  if info
    $stderr.print("Looking for files in ", dirs, ": ")
    spinner = Spinner.new($stderr)
    spinner.start
  end

  Find.find(*dirs) do |path|
    if prunable?(path, prune)
      Find.prune
    elsif File.file?(path)
      size = File.size(path)
      path.encode!(ENCODING_UTF8) if ENCODING_UTF8
      result[size] << path
      # Count each file exactly once, whether or not `info` is set.
      count += 1
      spinner.update if spinner
    end
  end

  if info
    $stderr.print(" done!\n")
    $stderr.print("Found ", count, " files in ",
                  result.size, " size groups.\n")
  end
  result
end
189
# Compare each file in `paths` and append lists of equal files to `result`
#
# Every unordered pair of paths is compared once, and `progress.add` is
# called once per comparison. Each appended list is sorted. Returns
# `result`.
def append_duplicates(result, paths, progress)
  # Maps a path to the set (Hash used as a set) of paths equal to it,
  # itself included.
  idsets = Hash.new { |h, k| h[k] = { k => true } }

  paths.combination(2) do |path_i, path_j|
    if duplicate_files?(path_i, path_j)
      idsets[path_i][path_j] = true
      idsets[path_j][path_i] = true
    end
    progress.add
  end

  # Every member of a group ends up holding an equal set, so `uniq`
  # leaves one set per group.
  idsets.values.uniq.each { |set| result << set.keys.sort }
  result
end
209
# Find all files in `fmap` and append lists of equal files to `result`
# `fmap` is a map of files by size.
#
# Zero-length files are skipped, and so are sizes with only one file.
# `opts.verbose` enables tracing on $stderr. Returns `result`.
def append_all_dupes(result, fmap, opts, progress)
  fmap.each_pair do |size, paths|
    # size > 0 (not > 1): one-byte files can be duplicates too.
    next unless size > 0 && paths.size > 1

    $stderr.print("Comparing ", paths, " ...") if opts.verbose
    append_duplicates(result, paths, progress)
    $stderr.print("done.\n") if opts.verbose
  end
  result
end
221
# Find all files in `srcdir`, compare to all files in `fmap`,
# and append lists of equal files to `result`.
# The path from `srcdir` will always be first in the list.
#
# `srcdir` is `opts.canondir`; `opts.prune` and `opts.verbose` are used
# as well. `progress.add` is called once per comparison. Returns `result`.
def append_dir_dupes(result, fmap, opts, progress)
  # Use names from the pre-rewrite version
  srcdir = opts.canondir
  prune = opts.prune
  verbose = opts.verbose

  $stderr.print("Looking for files in '", srcdir, "':\n") if verbose
  Find.find(srcdir) do |path_i|
    if prunable?(path_i, prune)
      Find.prune
    elsif File.file?(path_i)
      size = File.size(path_i)
      # fetch instead of []: a default block on `fmap` must not add an
      # empty group for every size that is merely looked up.
      paths = fmap.fetch(size, nil)
      next unless size > 0 && paths && !paths.empty?

      $stderr.print("Comparing '", path_i,
                    "' to ", paths, " ...") if verbose
      dupes = paths.select do |path_j|
        same = duplicate_files?(path_i, path_j)
        progress.add
        same
      end
      result << [path_i, *dupes.sort] unless dupes.empty?
      $stderr.print(" done.\n") if verbose
    end
  end
  $stderr.print("Done!\n") if verbose
  result
end
259
# Run-time configuration, filled in by `parse_options`.
# An anonymous Struct assigned to the constant, so nothing is registered
# in the global `Struct::` namespace.
Options = Struct.new(:info, :verbose, :canondir, :dirs, :prune,
                     :io, :format, :pretty, :perfdir,
                     :zero)
264
# Parse the command line in ARGV into an `Options` value.
#
# Option switches are removed from ARGV; whatever remains becomes `dirs`.
# `io` is the file named by --output, or $stdout when none was given.
def parse_options
  config = Options.new(true, false, nil, [], PRUNE,
                       nil, :yaml, false, nil, false)

  outfile = nil

  OptionParser.new do |opts|
    opts.banner = "Usage: duplicate-files.rb [options] dir1 [dir2 ...]"

    opts.on("-q", "--[no-]quiet", "Run absolutely quietly") do |v|
      config.info = !v
    end
    opts.on("-v", "--[no-]verbose", "Run verbosely") do |v|
      config.verbose = v
    end
    opts.on("-d", "--from [DIR]",
            "Compare arguments only to files from DIR") do |d|
      config.canondir = d
    end
    opts.on("-o", "--output [OUTFILE]",
            "Write standard output to OUTFILE") do |f|
      # The argument is optional, so `f` is nil for a bare -o; keep
      # writing to standard output in that case.
      outfile = File.new(f, 'w') if f
    end
    opts.on("-j", "--json", "Write output as JSON") do |_v|
      config.format = :json
    end
    opts.on("-y", "--yaml", "Write output as YAML (default)") do |_v|
      config.format = :yaml
    end
    opts.on("-p", "--[no-]pretty", "Pretty-print output") do |v|
      config.pretty = v
    end
    opts.on("-P", "--perf-data [DIR]",
            "Write performace data to a directory (default /tmp)") do |v|
      config.perfdir = v
    end
    opts.on("-z", "--[no-]zero", "Include zero-length files") do |v|
      config.zero = v
    end
  end.parse!

  config.io = (outfile || $stdout)
  config.dirs = ARGV

  config
end
313
# Write `data` as pretty-printed JSON to a new, time-stamped file.
#
# perfdir:: directory to write into; it is created when missing. When
#           nil, a fresh temporary directory is used instead.
# info::    when true, report the outcome on $stderr
# data::    any value `JSON.pretty_generate` accepts
#
# Logging is best-effort: ordinary errors are reported (when `info` is
# set) and otherwise ignored, so they never abort the run.
def log_perf_data(perfdir, info, data)
  name = 'perf-' + Time.now.strftime('%Y-%m-%d-%H%M%S-%N') + '.json'
  dir = if perfdir
          # The mode must be an octal literal; decimal 755 is octal 1363.
          Dir.mkdir(perfdir, 0755) unless Dir.exist?(perfdir)
          perfdir
        else
          Dir.mktmpdir('dfperf')
        end
  path = File.join(dir, name)
  # TODO: check that file doesn't already exist.
  File.open(path, 'w') do |file|
    file.write(JSON.pretty_generate(data))
  end
  $stderr.print("Wrote performance data to ", path, "\n") if info
rescue StandardError => msg
  # StandardError, not Exception: interrupts and `exit` must get through.
  $stderr.print("Cannot log performance data: ", msg, "\n") if info
end
337
# Main program: search the directories given on the command line for
# duplicate files and write the resulting sets of paths as YAML or JSON.
def run
  opts = parse_options

  time_start = Time.now

  fmap = files_by_size(opts.dirs, opts.prune, (opts.info || opts.verbose))

  time_search = Time.now

  $stderr.print("Comparing files ...\n") if opts.info

  progress = Progress.new
  progress.io = $stderr
  progress.quiet = !opts.info

  time_before_est = Time.now

  progress.do_estimate(fmap, !opts.canondir.nil?)

  time_estimate = Time.now

  results = []
  if opts.canondir
    append_dir_dupes(results, fmap, opts, progress)
  else
    append_all_dupes(results, fmap, opts, progress)
  end

  time_compare = Time.now

  # fetch instead of []: a Hash with a default block answers `fmap[0]`
  # with a (truthy) empty list even when no zero-length file was found.
  zerofiles = fmap.fetch(0, [])
  if opts.zero && !zerofiles.empty?
    # The empty string sorts before any path, so it takes the first slot
    # of the set and every real zero-length file comes after it.
    results << (zerofiles + ['']).sort
  end

  $stderr.print("... done!\n") if opts.info

  log_perf_data(opts.perfdir, opts.info, {
    "actual" => progress.actual,
    "estimate" => progress.estimate,
    "ngroups" => fmap.size,
    "time-compare" => time_compare - time_estimate,
    "time-estimate" => time_estimate - time_before_est,
    "time-search" => time_search - time_start,
    "data" => fmap
  })

  if opts.format == :json
    text = opts.pretty ? JSON.pretty_generate(results) : JSON.generate(results)
    opts.io.write(text)
    opts.io.write("\n")
  else
    # :line_width setting so file names w/spaces don't break across lines
    yamlopts = { :line_width => 4096 }
    if opts.pretty
      # {:canonical => true} looks almost identical to pretty-printed
      # JSON so the header keeps them distinct.
      yamlopts[:header] = true
      yamlopts[:canonical] = true
    end
    YAML.dump(results, opts.io, yamlopts)
  end
end

# Only run when executed as a script, so the file can also be loaded.
run if __FILE__ == $PROGRAM_NAME
remove-files.rb
Summary of Changes
- Add interactive mode to stop deletions at the last minute.
  Does NOT work if piping output from `duplicate-files.rb` into STDIN.
Usage
Usage: remove-files.rb [options] [listfile ...]
-q, --[no-]quiet Run absolutely quietly
-v, --[no-]verbose Run verbosely
-j, --[no-]json Read input as JSON
-y, --[no-]yaml Read input as YAML (default)
-x, --[no-]mixed Deduce format from files
-i, --[no-]interactive Wait for user approval before deleting.
-u, --[no-]dry-run Don't delete, just list files
Listing
1#!/usr/bin/env ruby
2
3require 'optparse'
4require 'yaml'
5require 'psych'
6require 'json'
7
# Append all but the first paths from each set in `filesets` to results
#
# result::   list the paths are appended to
# filesets:: list of lists of paths; nil or false (what an empty YAML
#            document loads as) is treated as "no sets"
# verbose::  when true, trace every set on $stderr
#
# Returns `result`.
def append_files(result, filesets, verbose = true)
  return result unless filesets

  filesets.each do |fset|
    # Skip the first file in each set; an empty set contributes nothing.
    kept, *delset = fset
    if verbose
      $stderr.print("Skipping ", kept.inspect, ";\n")
      $stderr.print("Adding ", delset, "\n")
    end
    result.concat(delset)
  end
  result
end
22
# Figure out if the `text` in `path` is JSON: either `path` ends in
# ".json", or the first non-blank character of `text` is one of { [ "
# Neither argument is modified.
def is_json(path, text)
  path.end_with?(".json") || text.lstrip.start_with?("{", "[", "\"")
end
28
# Read the list of file sets stored in `path` and append the paths to
# remove to `files` (see `append_files`).
#
# The format is deduced with `is_json`: JSON when it says so, YAML
# otherwise.
#
# files::   list the paths are appended to
# path::    file to read
# verbose:: when true, trace progress and error details on $stderr
# info::    when true, report skipped files on $stdout
#
# A file that cannot be read or parsed is skipped.
def read_file(files, path, verbose, info = true)
  $stderr.print("Reading ", path, " ...\n") if verbose
  text = File.read(path)
  if is_json(path, text)
    $stderr.print("Assuming JSON: ", path, "\n") if verbose
    append_files(files, JSON.parse(text), verbose)
  else
    $stderr.print("Assuming YAML: ", path, "\n") if verbose
    append_files(files, YAML.safe_load(text), verbose)
  end
rescue SystemCallError => msg
  $stderr.print(msg, "\n") if verbose
  $stdout.print("Cannot read ", path.inspect, "; skipping.\n") if info
rescue JSON::JSONError => msg
  $stderr.print(msg, "\n") if verbose
  $stdout.print(path.inspect, " isn't valid JSON; skipping.\n") if info
rescue Psych::Exception => msg
  $stderr.print(msg, "\n") if verbose
  $stdout.print(path.inspect, " isn't valid YAML; skipping.\n") if info
end
48
#
# Run the main loop
#
# Reads sets of duplicate paths (from the files named on the command
# line, or from standard input), keeps the first path of every set and
# deletes the others.
#
# Returns 0 on success, 1 when the user declines the interactive prompt,
# and -1 when the prompt cannot be answered.
#
def run
  info = true
  verbose = false
  input = :yaml
  dry = false
  interactive = false
  files = []

  OptionParser.new do |opts|
    opts.banner = "Usage: remove-files.rb [options] [listfile ...]"

    opts.on("-q", "--[no-]quiet", "Run absolutely quietly") do |v|
      info = !v
    end
    opts.on("-v", "--[no-]verbose", "Run verbosely") do |v|
      verbose = v
    end
    opts.on("-j", "--[no-]json", "Read input as JSON") do |v|
      input = :json if v
    end
    opts.on("-y", "--[no-]yaml", "Read input as YAML (default)") do |v|
      input = :yaml if v
    end
    opts.on("-x", "--[no-]mixed", "Deduce format from files") do |v|
      input = nil if v
    end
    opts.on("-i", "--[no-]interactive",
            "Wait for user approval before deleting.") do |v|
      interactive = v
    end
    opts.on("-u", "--[no-]dry-run", "Don't delete, just list files") do |v|
      dry = v
    end
  end.parse!

  begin
    if input == :json
      # Assuming we're getting a single JSON object ...
      append_files(files, JSON.parse(ARGF.read), verbose)
    elsif input == :yaml || ARGV.empty?
      append_files(files, YAML.safe_load(ARGF.read), verbose)
    else
      ARGV.each { |path| read_file(files, path, verbose) }
    end
  rescue Psych::Exception => msg
    $stderr.print(msg, "\n") if verbose
    $stdout.print("Input isn't valid YAML; skipping.\n") if info
  rescue JSON::JSONError => msg
    $stderr.print(msg, "\n") if verbose
    $stdout.print("Input isn't valid JSON; skipping.\n") if info
  end

  files.uniq!
  files.sort!

  if info && !files.empty?
    if interactive
      $stdout.print("About to remove the following files:\n")
    else
      $stdout.print("Removing the following files:\n")
    end
    YAML.dump(files, $stdout)
  end

  if interactive && !files.empty?
    $stdout.print("Remove ", files.size, " files? [y/N]: ")
    # Make sure the prompt is visible before blocking on the answer.
    $stdout.flush
    response = $stdin.gets
    unless response
      $stdout.print("No response; are you piping to STDIN? Exiting.\n")
      return -1
    end
    unless response.strip.start_with?('Y', 'y')
      $stdout.print("Exiting.\n")
      return 1
    end
  end

  count = 0

  files.each do |path|
    $stderr.print("-> rm ", path.inspect, "\n") if verbose
    begin
      File.delete(path) unless dry
      count += 1
    rescue StandardError => msg
      $stderr.print(msg, "\n") if verbose
      $stderr.print("Cannot remove ", path.inspect, "; skipping.\n") if info
    end
  end

  $stdout.print("Removed ", count, " files.") if info
  $stdout.print(" (Not really.)") if info && dry
  $stdout.print("\n") if info

  0
end

# Only run when executed as a script, and hand the result of `run` to the
# shell as the exit status.
exit(run) if __FILE__ == $PROGRAM_NAME