duplicate-files.rb - 2023-02-25

Posted: 2023-02-25
Word Count: 2394
Tags: programming ruby

Table of Contents

For an explanation, see the directory above.

Highlighting shows changes from the previous version.

duplicate-files.rb

Summary of Changes

Usage

Usage: duplicate-files.rb [options] dir1 [dir2 ...]
    -q, --[no-]quiet                 Run absolutely quietly
    -v, --[no-]verbose               Run verbosely
    -d, --from [DIR]                 Compare arguments only to files from DIR
    -o, --output [OUTFILE]           Write standard output to OUTFILE
    -j, --json                       Write output as JSON
    -y, --yaml                       Write output as YAML (default)
    -p, --[no-]pretty                Pretty-print output
    -P, --perf-data [DIR]            Write performance data to a directory (default /tmp)
    -z, --[no-]zero                  Include zero-length files

Listing

  1#!/usr/bin/env ruby 
  2
  3require 'find'
  4require 'fileutils'
  5require 'optparse'
  6require 'rational'
  7require 'tempfile'
  8require 'yaml'
  9require 'json'
 10
 11
 12# Default list of files to prune in search
 13PRUNE = ['.svn', 'CVS', 'CVSROOT', '.DS_Store', '.git']
 14
 15# Unfortunate artifact of transition between Ruby 1.9 and 2.0
 16if self.class.const_defined?(:Encoding) then
 17    ENCODING_UTF8 = Encoding.find('UTF-8')
 18else
 19    ENCODING_UTF8 = nil
 20end
 21
 22# Simple class to manage an ASCII spinner
 23class Spinner
 24    SPINNER_STATES = ['-', '\\', '|', "/"]
 25
 26    UPDATE_INTERVAL = 0.5 # s
 27
 28    def initialize(io=$stderr)
 29        @io = io
 30        @state = 0
 31        @updated = nil
 32    end
 33
 34    def start
 35        @updated = Time.now
 36        @io.print(SPINNER_STATES[0])
 37    end
 38
 39    def update
 40        if not @updated then
 41            start
 42        end
 43
 44        now = Time.now
 45        if now - @updated > UPDATE_INTERVAL then
 46            @updated = now
 47            @state = (@state + 1) % SPINNER_STATES.length
 48            @io.print("\b", SPINNER_STATES[@state])
 49        end
 50    end
 51end
 52
 53#
 54# Not-so-simple class to manage a progress bar
 55#
 56class Progress
 57    BAR_LENGTH = 40
 58
 59    UPDATE_INTERVAL = 1.0 # s
 60
 61    attr_accessor :actual, :estimate, :io, :quiet
 62
 63    def initialize
 64        @io = io
 65        @updated = nil
 66        @estimate = 0
 67        @actual = 0
 68        @quiet = quiet
 69    end
 70
 71    def do_estimate(results, canon)
 72        @estimate = 0
 73        results.each do |fileset|
 74            # choose(x, k) is the number of ways one can choose `k`
 75            # items from a set of `n`.
 76            size = fileset.size
 77            size += 1 if canon
 78            if size > 1 then
 79                @estimate += size * (size - 1) 
 80                # @estimate += choose(size, 2) 
 81                # @estimate += factorial(size)
 82            end
 83        end
 84        update
 85    end
 86
 87    def factorial(n)
 88        if n <= 1 then 1 else n * factorial(n-1) end
 89    end
 90
 91    def choose(n, k)
 92        if n < 1 or k < 0 or k > n then
 93            return 0
 94        end
 95        result = 1
 96        1.upto(k) do |i|
 97            result = result * (n + 1 - i) / i
 98        end
 99        return result
100    end
101
102    def add(amt = 1)
103        @actual += amt
104        update
105    end
106
107    def progress(increments = 100)
108        if @estimate == 0 then
109            return 0
110        else
111            return Rational(increments * @actual, @estimate).floor.to_i
112        end
113    end
114
115    def progress_bar
116        str = String.new
117        1.upto(BAR_LENGTH) do |i|
118            if i <= progress(BAR_LENGTH) then
119                str << '#'
120            else
121                str << '-'
122            end
123        end
124        return str
125    end
126
127    def update
128        if not @updated then
129            @updated = Time.now
130        end
131
132        now = Time.now
133        if now - @updated > UPDATE_INTERVAL then
134            @updated = now
135            if not quiet then
136                msg = @actual.to_s + "/" + @estimate.to_s + \
137                      " |" + progress_bar + "|"
138                @io.print("\b"*msg.size, msg)
139            end
140        end
141    end
142end
143
# True when `path_i` and `path_j` both exist, are not the very same file,
# and FileUtils.cmp reports their contents as equal.
def duplicate_files?(path_i, path_j)
    return false unless File.exist?(path_i) && File.exist?(path_j)
    # Two names for one file (e.g. a hard link) are not duplicates.
    return false if File.identical?(path_i, path_j)

    FileUtils.cmp(path_i, path_j)
end
151
# Whether `path` should be skipped: its basename is listed in `prune` or
# matches the glob '._*'.
def prunable?(path, prune=[])
    base = File.basename(path)
    return true if prune.include?(base)

    File.fnmatch('._*', base)
end
156
# Recurse through the array of `dirs` and produce a Hash of all files by size.
#
# dirs::  directories (or files) handed to Find.find
# prune:: basenames to skip; see prunable?
# info::  when true, draw a spinner and print a summary on $stderr
#
# Returns a Hash mapping file size => Array of paths. The Hash has a default
# block, so looking up a missing size inserts an empty Array for it.
def files_by_size(dirs, prune=[], info=false)
    result = Hash.new { |h, k| h[k] = [] }
    count = 0
    spinner = nil
    if info
        $stderr.print("Looking for files in ", dirs, ": ")
        spinner = Spinner.new($stderr)
        spinner.start
    end
    Find.find(*dirs) do |path|
        if prunable?(path, prune)
            Find.prune
        elsif File.file?(path)
            size = File.size(path)
            path.encode!(ENCODING_UTF8) if ENCODING_UTF8
            result[size] << path
            # Count each file exactly once, whether or not the spinner runs.
            count += 1
            spinner.update if spinner
        end
    end
    if info
        $stderr.print(" done!\n")

        $stderr.print("Found ", count, " files in ",
                      result.size, " size groups.\n")
    end
    result
end
189
# Compare every pair of files in `paths` and append one sorted list per set
# of equal files to `result`. `progress.add` is called once per compared
# pair. Returns `result`.
def append_duplicates(result, paths, progress)
    # path => { member path => true }; a set is created (seeded with its
    # own key) the first time a path turns out to have a duplicate.
    groups = Hash.new { |hash, path| hash[path] = { path => true } }
    paths.combination(2) do |first, second|
        if duplicate_files?(first, second)
            groups[first][second] = true
            groups[second][first] = true
        end
        progress.add
    end
    # Every member of a set ends up with an equal Hash, so uniq leaves one
    # entry per set.
    groups.values.uniq.each { |members| result << members.keys.sort }
    result
end
209
# Compare the files within every size group of `fmap` and append lists of
# equal files to `result`.
#
# fmap::     map of file size => paths
# opts::     only `opts.verbose` is read here
# progress:: passed through to append_duplicates
#
# Zero-length files and groups with a single path are skipped.
def append_all_dupes(result, fmap, opts, progress)
    fmap.each_pair do |size, paths|
        # Only empty files are excluded by size (same rule as
        # append_dir_dupes); one-byte files can be duplicates too.
        next unless size > 0 && paths.size > 1

        $stderr.print("Comparing ", paths, " ...") if opts.verbose
        append_duplicates(result, paths, progress)
        $stderr.print("done.\n") if opts.verbose
    end
end
221
# Find all files under `opts.canondir`, compare each one to the files of the
# same size in `fmap`, and append lists of equal files to `result`.
# The path from `opts.canondir` is always first in its list; the remaining
# paths are sorted.
#
# fmap::     map of file size => paths
# opts::     reads `canondir`, `prune` and `verbose`
# progress:: `add` is called once per comparison
#
# Returns `result`.
def append_dir_dupes(result, fmap, opts, progress)
    srcdir = opts.canondir
    prune = opts.prune
    verbose = opts.verbose

    $stderr.print("Looking for files in '", srcdir, "':\n") if verbose
    Find.find(srcdir) do |path_i|
        if prunable?(path_i, prune)
            Find.prune
        elsif File.file?(path_i)
            size = File.size(path_i)
            # fetch bypasses any default block on the map, so probing a
            # size that has no group does not insert an empty one.
            paths = fmap.fetch(size, nil)
            next unless size > 0 && paths && !paths.empty?

            $stderr.print("Comparing '", path_i,
                          "' to ", paths, " ...") if verbose
            dupes = []
            paths.each do |path_j|
                dupes << path_j if duplicate_files?(path_i, path_j)
                progress.add
            end
            unless dupes.empty?
                dupes.sort!
                result << [path_i, *dupes]
            end
            $stderr.print(" done.\n") if verbose
        end
    end
    $stderr.print("Done!\n") if verbose
    result
end
259
# Settings gathered from the command line; see parse_options for defaults.
Options = Struct.new("Options",
                     :info, :verbose, :canondir, :dirs, :prune,
                     :io, :format, :pretty, :perfdir,
                     :zero)

# Parse the command line (ARGV is consumed in place) into an Options struct.
#
# Defaults: info on, verbose off, no canonical directory, PRUNE as prune
# list, output to $stdout as YAML, not pretty-printed, no perf directory,
# zero-length files excluded. Whatever remains in ARGV after option parsing
# becomes `dirs`.
def parse_options
    config = Options.new(true, false, nil, [], PRUNE,
                         nil, :yaml, false, nil, false)

    outfile = nil

    OptionParser.new do |opts|
        opts.banner = "Usage: duplicate-files.rb [options] dir1 [dir2 ...]"

        opts.on("-q", "--[no-]quiet", "Run absolutely quietly") do |v|
            config.info = !v
        end
        opts.on("-v", "--[no-]verbose", "Run verbosely") do |v|
            config.verbose = v
        end
        opts.on("-d", "--from [DIR]",
                "Compare arguments only to files from DIR") do |d|
            config.canondir = d
        end
        opts.on("-o", "--output [OUTFILE]",
                "Write standard output to OUTFILE") do |f|
            # The argument is optional, so `f` is nil for a bare -o; keep
            # writing to $stdout in that case instead of failing.
            outfile = File.new(f, 'w') if f
        end
        opts.on("-j", "--json", "Write output as JSON") do |_v|
            config.format = :json
        end
        opts.on("-y", "--yaml", "Write output as YAML (default)") do |_v|
            config.format = :yaml
        end
        opts.on("-p", "--[no-]pretty", "Pretty-print output") do |v|
            config.pretty = v
        end
        opts.on("-P", "--perf-data [DIR]",
                "Write performance data to a directory (default /tmp)") do |v|
            config.perfdir = v
        end
        opts.on("-z", "--[no-]zero", "Include zero-length files") do |v|
            config.zero = v
        end
    end.parse!

    config.io = (outfile || $stdout)

    config.dirs = ARGV

    config
end
313
# Write `data` as pretty-printed JSON to a new, timestamp-named file.
#
# perfdir:: directory for the file; created (with parents, mode 0755) when
#           missing. When nil, a fresh directory from Dir.mktmpdir is used.
# info::    when true, report the written path or the failure on $stderr
# data::    any object JSON.pretty_generate accepts
#
# Failures are reported (if `info`) and otherwise ignored: perf logging is
# best-effort and must not abort the run.
def log_perf_data(perfdir, info, data)
    name = 'perf-' + Time.now.strftime('%Y-%m-%d-%H%M%S-%N') + '.json'
    if perfdir
        # The mode must be octal: decimal 755 is 01363 and yields a
        # directory its owner cannot read.
        FileUtils.mkdir_p(perfdir, :mode => 0755) unless Dir.exist?(perfdir)
        path = File.join(perfdir, name)
    else
        path = File.join(Dir.mktmpdir('dfperf'), name)
    end
    # TODO: check that file doesn't already exist.
    File.open(path, 'w') do |file|
        file.write(JSON.pretty_generate(data))
    end
    $stderr.print("Wrote performance data to ", path, "\n") if info
rescue StandardError => msg
    # StandardError rather than Exception, so interrupts and exit requests
    # still propagate.
    $stderr.print("Cannot log performance data: ", msg, "\n") if info
end
337
# Main program: parse options, group files by size, compare the files in
# each group, log timing data, and write the duplicate sets to `opts.io`
# as JSON or YAML.
def run
    opts = parse_options

    time_start = Time.now

    fmap = files_by_size(opts.dirs, opts.prune, (opts.info || opts.verbose))

    time_search = Time.now

    $stderr.print("Comparing files ...\n") if opts.info

    progress = Progress.new
    progress.io = $stderr
    progress.quiet = !opts.info

    time_before_est = Time.now

    progress.do_estimate(fmap, opts.canondir != nil)

    time_estimate = Time.now

    results = []
    if opts.canondir
        append_dir_dupes(results, fmap, opts, progress)
    else
        append_all_dupes(results, fmap, opts, progress)
    end

    time_compare = Time.now

    # files_by_size builds the map with a default block, so `fmap[0]` is
    # always truthy and would insert an empty group; fetch avoids both.
    zerofiles = fmap.fetch(0, [])
    if opts.zero && !zerofiles.empty?
        # The empty string sorts first, ahead of every real path in the set.
        results << (zerofiles + ['']).sort
    end

    $stderr.print("... done!\n") if opts.info

    log_perf_data(opts.perfdir, opts.info, {
        "actual" => progress.actual,
        "estimate" => progress.estimate,
        "ngroups" => fmap.size,
        "time-compare" => time_compare - time_estimate,
        "time-estimate" => time_estimate - time_before_est,
        "time-search" => time_search - time_start,
        "data" => fmap})

    if opts.format == :json
        text = if opts.pretty
                   JSON.pretty_generate(results)
               else
                   JSON.generate(results)
               end
        opts.io.write(text)
        opts.io.write("\n")
    else
        # :line_width setting so file names w/spaces don't break across lines
        yamlopts = {:line_width => 4096}
        if opts.pretty
            # {:canonical => true} looks almost identical to pretty-printed
            # JSON so the header keeps them distinct.
            yamlopts[:header] = true
            yamlopts[:canonical] = true
        end
        YAML.dump(results, opts.io, yamlopts)
    end
end
410
# Script entry point.
run()

remove-files.rb

Summary of Changes

Usage

Usage: remove-files.rb [options] [listfile ...]
    -q, --[no-]quiet                 Run absolutely quietly
    -v, --[no-]verbose               Run verbosely
    -j, --[no-]json                  Read input as JSON
    -y, --[no-]yaml                  Read input as YAML (default)
    -x, --[no-]mixed                 Deduce format from files
    -i, --[no-]interactive           Wait for user approval before deleting.
    -u, --[no-]dry-run               Don't delete, just list files

Listing

  1#!/usr/bin/env ruby 
  2
  3require 'optparse'
  4require 'yaml'
  5require 'psych'
  6require 'json'
  7
  8# Append all but the first paths from each set in `filesets` to results
  9def append_files(result, filesets, verbose=true)
 10  filesets.each do |fset|
 11    delset = fset[1..-1]
 12    # Skip the first file in each set
 13    if verbose then
 14      $stderr.print("Skipping ", fset[0].inspect, ";\n");
 15      $stderr.print("Adding ", delset, "\n")
 16    end
 17    delset.each do |n|
 18      result << n
 19    end
 20  end
 21end
 22
 23# Figure out if the `text` in `path` is JSON
 24def is_json(path, text)
 25  text.strip! # Bad form to modify an argument
 26  path.end_with?(".json") or text.start_with?("{", "[", "\"")
 27end
 28
 29def read_file(files, path, verbose)
 30  $stderr.print("Reading ", path, " ...\n") if verbose
 31  text = f.read
 32  begin
 33    if is_json(path, text) then
 34      $stderr.print("Assuming JSON: ", path) if verbose
 35      append_files(files, JSON.parse(text), verbose)
 36    else
 37      $stderr.print("Assuming YAML: ", path) if verbose
 38      append_files(files, YAML.safe_load(text), verbose)
 39    end
 40  rescue JSON::JsonError => msg
 41    $stderr.print(msg, "\n") if verbose
 42    $stdout.print(path.inspect, "isn't valid JSON; skipping.\n") if info
 43  rescue Psych::Exception => msg
 44    $stderr.print(msg, "\n") if verbose
 45    $stdout.print(path.inspect, "isn't valid YAML; skipping.\n") if info
 46  end
 47end
 48
 49#
 50# Run the main loop
 51#
 52def run
 53
 54  info = true
 55  verbose = false
 56  input = :yaml
 57  dry = false
 58  interactive = false
 59  files = []
 60
 61  OptionParser.new do |opts|
 62    opts.banner = "Usage: remove-files.rb [options] [listfile ...]"
 63
 64    opts.on("-q", "--[no-]quiet", "Run absolutely quietly") do |v|
 65      info = (not v)
 66    end
 67    opts.on("-v", "--[no-]verbose", "Run verbosely") do |v|
 68      verbose = v
 69    end
 70    opts.on("-j", "--[no-]json", "Read input as JSON") do |v|
 71      input = :json if v
 72    end
 73    opts.on("-y", "--[no-]yaml", "Read input as YAML (default)") do |v|
 74      input = :yaml if v
 75    end
 76    opts.on("-x", "--[no-]mixed", "Deduce format from files") do |v|
 77      input = nil if v
 78    end
 79    opts.on("-i", "--[no-]interactive", 
 80            "Wait for user approval before deleting.") do |v|
 81      interactive = v
 82    end
 83    opts.on("-u", "--[no-]dry-run", "Don't delete, just list files") do |v|
 84      dry = v
 85    end
 86  end.parse!
 87
 88  begin
 89    if input == :json then
 90      # Assuming we're getting a single JSON object ...
 91      result = JSON.parse(ARGF.read)
 92      append_files(files, result, verbose)
 93    elsif input == :yaml or ARGV.empty? then
 94      append_files(files, YAML.safe_load(ARGF.read), verbose)
 95    else
 96      ARGV.each { |path| read_file(files, path, verbose) }
 97    end
 98  rescue Psych::Exception => msg
 99    $stderr.print(msg, "\n") if verbose
100    $stdout.print("Input isn't valid YAML; skipping.\n") if info
101  rescue JSON::JsonError => msg
102    $stderr.print(msg, "\n") if verbose
103    $stdout.print("Input isn't valid JSON; skipping.\n") if info
104  end
105
106  files.uniq!
107  files.sort!
108
109  if info and not files.empty? then
110    if interactive then
111        $stdout.print("About to remove the following files:\n")
112    else
113        $stdout.print("Removing the following files:\n")
114    end
115    YAML.dump(files, $stdout)
116  end
117
118  if interactive and not files.empty? then
119    $stdout.print("Remove ", files.size, " files? [y/N]: ")
120    response = $stdin.gets
121    if not response then
122      $stdout.print("No response; are you piping to STDIN? Exiting.\n")
123      return -1
124    else
125      response.strip!
126      if not response.start_with?('Y', 'y') then
127        $stdout.print("Exiting.\n")
128        return 1
129      end
130    end
131  end
132
133  count = 0
134
135  files.each do |path|
136    $stderr.print("-> rm ", path.inspect, "\n") if verbose
137    begin
138      File.delete(path) if not dry 
139      count = count + 1
140    rescue StandardError => msg
141      $stderr.print(msg, "\n") if verbose
142      $stderr.print("Cannot remove ", path.inspect, "; skipping.\n") if info
143    end
144  end
145
146  $stdout.print("Removed ", count, " files.") if info
147  $stdout.print(" (Not really.)") if info and dry
148  $stdout.print("\n") if info
149
150  return 0
151end
152
# Script entry point: hand run's status (0, 1 or -1) to the shell instead of
# discarding it.
exit(run)