duplicate-files.rb - 2023-03-07

Posted: 2023-03-07
Word Count: 2094
Tags: programming ruby

Table of Contents

For an explanation see the directory above.

Highlighting shows changes from the previous version.

duplicate-files.rb

Summary of Changes

Usage

Usage: duplicate-files.rb [options] dir1 [dir2 ...]
    -q, --[no-]quiet                 Run absolutely quietly
    -v, --[no-]verbose               Run verbosely
    -d, --from DIR                   Compare arguments to files from DIR (deprecated)
    -o, --output OUTFILE             Write standard output to OUTFILE
    -j, --json                       Write output as JSON
    -y, --yaml                       Write output as YAML (default)
    -p, --[no-]pretty                Pretty-print output
    -P, --perf-data DIR              Write performance data to DIR
    -z, --[no-]zero                  Include zero-length files

Listing

  1#!/usr/bin/env ruby 
  2
  3require 'find'
  4require 'fileutils'
  5require 'optparse'
  6require 'pathname'
  7require 'rational'
  8require 'tempfile'
  9require 'yaml'
 10require 'json'
 11
 12
 13# List of file patterns to prune in search
 14PRUNE = [
 15  # Version Control directories and files
 16  /CVS/, /CVSROOT/, /\.git/, /\.gitignore/, /\.svn/,
 17  # Trash folders, usually off user or media root directories
 18  /Trash/, /\.Trash.*/,
 19  # Apple Mac metadata files and folders
 20  /\.DS_Store/, /\._.*/, /\.Apple.*/
 21]
 22
 23# Unfortunate artifact of transition between Ruby 1.9 and 2.0
 24if self.class.const_defined?(:Encoding) then
 25    ENCODING_UTF8 = Encoding.find('UTF-8')
 26else
 27    ENCODING_UTF8 = nil
 28end
 29
 30# Simple class to manage an ASCII spinner
 31class Spinner
 32    SPINNER_STATES = ['-', '\\', '|', "/"]
 33
 34    UPDATE_INTERVAL = 0.5 # s
 35
 36    def initialize(io=$stderr)
 37        @io = io
 38        @state = 0
 39        @updated = nil
 40    end
 41
 42    def start
 43        @updated = Time.now
 44        @io.print(SPINNER_STATES[0])
 45    end
 46
 47    def update
 48        if not @updated then
 49            start
 50        end
 51
 52        now = Time.now
 53        if now - @updated > UPDATE_INTERVAL then
 54            @updated = now
 55            @state = (@state + 1) % SPINNER_STATES.length
 56            @io.print("\b", SPINNER_STATES[@state])
 57        end
 58    end
 59
 60    def stop
 61        @io.print("\b ")
 62    end
 63end
 64
 65def prunable?(path)
 66    name = File.basename(path)
 67    PRUNE.each do |p|
 68       return true if p =~ name
 69    end
 70    return false
 71end
 72
 73# Recurse through array of `dirs` and produce Hash of all files by size.
 74def files_by_size(fmap, dirs, spinner=nil)
 75    Find.find(*dirs) do |path|
 76        if prunable?(path) then
 77            Find.prune
 78        elsif File.file?(path) then
 79            size = File.size(path)
 80            path.encode!(ENCODING_UTF8) if ENCODING_UTF8
 81            if not fmap[size] then
 82                fmap[size] = {}
 83            end
 84            fmap[size][Pathname(path)] = true
 85            spinner.update if spinner
 86        end
 87    end
 88end
 89
 90# Compare each file in `paths` and append lists of equal files to `fmap`
 91def append_duplicates(result, paths, progress, verbose=false)
 92    idsets = Hash.new {|h,k| h[k] = { k => true } }
 93
 94    paths.keys.combination(2) do |path_i, path_j|
 95        if path_i.exist? and path_j.exist? \
 96                and path_i.realpath != path_j.realpath then
 97            $stderr.print("Comparing <", path_i, 
 98                          "> and <", path_j, "> ...") if verbose
 99            if FileUtils.cmp(path_i, path_j) then
100                idsets[path_i][path_j] = true
101                idsets[path_j][path_i] = true
102            end
103            progress.update if progress
104            $stderr.print(" done!\n") if verbose
105        end
106    end
107    idsets.values.uniq.each do |s|
108        result << s.keys.sort
109    end
110    result.sort!
111    return result
112end
113
# Find all duplicate files in `fmap` and append lists of equal files to
# `result`.
#
# result::   Array receiving the groups of identical files
# fmap::     map of files by size: size => { Pathname => true }
# opts::     options object; only `verbose` is read here
# progress:: optional spinner, started before and stopped after the scan
#
# Only size groups holding more than one file are compared.  Zero-length
# files are skipped here; the caller adds them separately when asked to.
def append_all_dupes(result, fmap, opts, progress)
    progress.start if progress
    fmap.each_pair do |size, paths|
        # `size > 0`, not `size > 1`: one-byte files can be duplicates too.
        next unless size > 0 && paths.size > 1
        append_duplicates(result, paths, progress, opts.verbose)
    end
    progress.stop if progress
end
125
# Settings for one run, filled in by `parse_options`.
Options = Struct.new("Options",
                     :info,      # print progress messages; switched off by -q
                     :verbose,   # announce each file comparison (-v)
                     :canondir,  # extra directory given with -d, or nil
                     :dirs,      # remaining command-line arguments
                     :io,        # stream the result is written to
                     :format,    # :yaml or :json
                     :pretty,    # pretty-print the output (-p)
                     :perfdir,   # directory for performance data (-P), or nil
                     :zero)      # include zero-length files (-z)
130
# Parse the command line into an Options struct.
#
# Recognized switches are removed from ARGV; whatever is left becomes
# `dirs`.  Output goes to the file named with -o when given, otherwise to
# $stdout.
def parse_options
    config = Options.new(true, false, nil, [],
                         nil, :yaml, false, nil, false)
    outfile = nil

    parser = OptionParser.new do |opts|
        opts.banner = "Usage: duplicate-files.rb [options] dir1 [dir2 ...]"

        opts.on("-q", "--[no-]quiet", "Run absolutely quietly") do |quiet|
            config.info = !quiet
        end
        opts.on("-v", "--[no-]verbose", "Run verbosely") do |flag|
            config.verbose = flag
        end
        opts.on("-d", "--from DIR",
                "Compare arguments to files from DIR (deprecated)") do |dir|
            config.canondir = dir
        end
        opts.on("-o", "--output OUTFILE",
                "Write standard output to OUTFILE") do |name|
            outfile = File.new(name, 'w')
        end
        opts.on("-j", "--json", "Write output as JSON") do
            config.format = :json
        end
        opts.on("-y", "--yaml", "Write output as YAML (default)") do
            config.format = :yaml
        end
        opts.on("-p", "--[no-]pretty", "Pretty-print output") do |flag|
            config.pretty = flag
        end
        opts.on("-P", "--perf-data DIR",
                "Write performance data to DIR") do |dir|
            config.perfdir = dir
        end
        opts.on("-z", "--[no-]zero", "Include zero-length files") do |flag|
            config.zero = flag
        end
    end
    parser.parse!

    config.io = outfile || $stdout
    config.dirs = ARGV
    config
end
179
# Write `data` as pretty-printed JSON to a time-stamped file.
#
# perfdir:: directory to write into; created if missing.  When nil, a
#           fresh temporary directory is used instead.
# info::    when true, success or failure is reported on $stderr
# data::    object handed to JSON.pretty_generate
#
# Logging is best effort: ordinary errors are reported (if `info`) and
# otherwise ignored.  Only StandardError is rescued, so interrupts and
# exit requests still get through.
def log_perf_data(perfdir, info, data)
    name = 'perf-' + Time.now.strftime('%Y-%m-%d-%H%M%S-%N') + '.json'
    if perfdir
        # The mode must be octal; decimal 755 would give unusable permissions.
        Dir.mkdir(perfdir, 0o755) unless Dir.exist?(perfdir)
        dir = perfdir
    else
        dir = Dir.mktmpdir('dfperf')
    end
    path = File.join(dir, name)
    # TODO: check that file doesn't already exist.
    File.open(path, 'w') do |file|
        file.write(JSON.pretty_generate(data))
    end
    $stderr.print("Wrote performance data to ", path, "\n") if info
rescue StandardError => msg
    $stderr.print("Cannot log performance data: ", msg, "\n") if info
end
203
# Main entry point: parse the command line, collect files by size, compare
# same-sized files, and write the groups of duplicates as JSON or YAML.
# Progress messages go to $stderr; the result goes to `opts.io`.
def run
    opts = parse_options

    spinner = nil
    if opts.info || opts.verbose
        $stderr.print("Looking for files in ", opts.dirs, " ... ")
        spinner = Spinner.new($stderr)
        spinner.start
    end

    time_start = Time.now

    # Search phase: the -d directory is scanned into the same map.
    fmap = {}
    files_by_size(fmap, opts.dirs, spinner)
    files_by_size(fmap, [opts.canondir], spinner) if opts.canondir

    time_search = Time.now

    if spinner
        spinner.stop
        $stderr.print("done!\n")
        count = fmap.each_value.inject(0) { |total, group| total + group.size }
        $stderr.print("Found ", count, " files in ",
                      fmap.size, " size groups.\n")
    end

    $stderr.print("Comparing files ... ") if opts.info

    time_before_compare = Time.now

    # Compare phase.
    results = []
    append_all_dupes(results, fmap, opts, spinner)

    time_compare = Time.now

    $stderr.print("done!\n") if opts.info

    # With -z all zero-length files form one extra group, which also
    # receives an empty path as a member.
    if opts.zero && fmap[0]
        zerofiles = fmap[0].keys + [Pathname('')]
        results << zerofiles.sort
        results.sort!
    end

    bubble_up(opts.canondir, results) if opts.canondir

    if opts.perfdir
        perf = {
            "ngroups" => fmap.size,
            "time-compare" => time_compare - time_before_compare,
            "time-search" => time_search - time_start,
            "data" => fmap
        }
        log_perf_data(opts.perfdir, opts.info, perf)
    end

    case opts.format
    when :json
        text = opts.pretty ? JSON.pretty_generate(results) : JSON.generate(results)
        opts.io.write(text)
        opts.io.write("\n")
    else
        # :line_width setting so file names w/spaces don't break across lines
        yamlopts = {:line_width => 4096}
        if opts.pretty
            # {:canonical => true} looks almost identical to pretty-printed
            # JSON so the header keeps them distinct.
            yamlopts[:header] = true
            yamlopts[:canonical] = true
        end
        # Using Pathname upstream means exposing it in YAML;
        # dump a copy made of just Strings and Arrays instead.
        saferesults = results.map { |set| set.map { |path| path.to_s } }
        YAML.safe_dump(saferesults, opts.io, yamlopts)
    end
end
300
# Re-sort every list in `results` in place so that paths located under
# `canondir` come first.  The ordering is delegated to `canon_first`.
def bubble_up(canondir, results)
  cpath = Pathname(canondir)
  results.each do |group|
    group.sort! { |a, b| canon_first(cpath, a, b) }
  end
end
310
# Sort comparator: a path under `cpath` orders before one that is not.
# When both or neither are under `cpath`, plain Pathname order decides.
# Returns -1, 0 or 1.
def canon_first(cpath, a, b)
  a_in_dir = is_parent?(cpath, a)
  b_in_dir = is_parent?(cpath, b)

  # Same side of the canonical directory: keep the usual path order.
  return Pathname(a) <=> Pathname(b) if a_in_dir == b_in_dir

  a_in_dir ? -1 : 1
end
324
# True when `cpath` equals `path` itself or one of the ancestors that
# `path.ascend` yields; false otherwise.
def is_parent?(cpath, path)
  path.ascend { |ancestor| return true if cpath == ancestor }
  false
end
335
336run

remove-files.rb

Summary of Changes

None.

Usage

Usage: remove-files.rb [options] [listfile ...]
    -q, --[no-]quiet                 Run absolutely quietly
    -v, --[no-]verbose               Run verbosely
    -j, --[no-]json                  Read input as JSON
    -y, --[no-]yaml                  Read input as YAML (default)
    -x, --[no-]mixed                 Deduce format from files
    -i, --[no-]interactive           Wait for user approval before deleting.
    -u, --[no-]dry-run               Don't delete, just list files

Listing

  1#!/usr/bin/env ruby 
  2
  3require 'optparse'
  4require 'yaml'
  5require 'psych'
  6require 'json'
  7
  8# Append all but the first paths from each set in `filesets` to results
  9def append_files(result, filesets, verbose=true)
 10  filesets.each do |fset|
 11    delset = fset[1..-1]
 12    # Skip the first file in each set
 13    if verbose then
 14      $stderr.print("Skipping ", fset[0].inspect, ";\n");
 15      $stderr.print("Adding ", delset, "\n")
 16    end
 17    delset.each do |n|
 18      result << n
 19    end
 20  end
 21end
 22
 23# Figure out if the `text` in `path` is JSON
 24def is_json(path, text)
 25  text.strip! # Bad form to modify an argument
 26  path.end_with?(".json") or text.start_with?("{", "[", "\"")
 27end
 28
 29def read_file(files, path, verbose)
 30  $stderr.print("Reading ", path, " ...\n") if verbose
 31  text = f.read
 32  begin
 33    if is_json(path, text) then
 34      $stderr.print("Assuming JSON: ", path) if verbose
 35      append_files(files, JSON.parse(text), verbose)
 36    else
 37      $stderr.print("Assuming YAML: ", path) if verbose
 38      append_files(files, YAML.safe_load(text), verbose)
 39    end
 40  rescue JSON::JsonError => msg
 41    $stderr.print(msg, "\n") if verbose
 42    $stdout.print(path.inspect, "isn't valid JSON; skipping.\n") if info
 43  rescue Psych::Exception => msg
 44    $stderr.print(msg, "\n") if verbose
 45    $stdout.print(path.inspect, "isn't valid YAML; skipping.\n") if info
 46  end
 47end
 48
 49#
 50# Run the main loop
 51#
 52def run
 53
 54  info = true
 55  verbose = false
 56  input = :yaml
 57  dry = false
 58  interactive = false
 59  files = []
 60
 61  OptionParser.new do |opts|
 62    opts.banner = "Usage: remove-files.rb [options] [listfile ...]"
 63
 64    opts.on("-q", "--[no-]quiet", "Run absolutely quietly") do |v|
 65      info = (not v)
 66    end
 67    opts.on("-v", "--[no-]verbose", "Run verbosely") do |v|
 68      verbose = v
 69    end
 70    opts.on("-j", "--[no-]json", "Read input as JSON") do |v|
 71      input = :json if v
 72    end
 73    opts.on("-y", "--[no-]yaml", "Read input as YAML (default)") do |v|
 74      input = :yaml if v
 75    end
 76    opts.on("-x", "--[no-]mixed", "Deduce format from files") do |v|
 77      input = nil if v
 78    end
 79    opts.on("-i", "--[no-]interactive", 
 80            "Wait for user approval before deleting.") do |v|
 81      interactive = v
 82    end
 83    opts.on("-u", "--[no-]dry-run", "Don't delete, just list files") do |v|
 84      dry = v
 85    end
 86  end.parse!
 87
 88  begin
 89    if input == :json then
 90      # Assuming we're getting a single JSON object ...
 91      result = JSON.parse(ARGF.read)
 92      append_files(files, result, verbose)
 93    elsif input == :yaml or ARGV.empty? then
 94      append_files(files, YAML.safe_load(ARGF.read), verbose)
 95    else
 96      ARGV.each { |path| read_file(files, path, verbose) }
 97    end
 98  rescue Psych::Exception => msg
 99    $stderr.print(msg, "\n") if verbose
100    $stdout.print("Input isn't valid YAML; skipping.\n") if info
101  rescue JSON::JsonError => msg
102    $stderr.print(msg, "\n") if verbose
103    $stdout.print("Input isn't valid JSON; skipping.\n") if info
104  end
105
106  files.uniq!
107  files.sort!
108
109  if info and not files.empty? then
110    if interactive then
111        $stdout.print("About to remove the following files:\n")
112    else
113        $stdout.print("Removing the following files:\n")
114    end
115    YAML.dump(files, $stdout)
116  end
117
118  if interactive and not files.empty? then
119    $stdout.print("Remove ", files.size, " files? [y/N]: ")
120    response = $stdin.gets
121    if not response then
122      $stdout.print("No response; are you piping to STDIN? Exiting.\n")
123      return -1
124    else
125      response.strip!
126      if not response.start_with?('Y', 'y') then
127        $stdout.print("Exiting.\n")
128        return 1
129      end
130    end
131  end
132
133  count = 0
134
135  files.each do |path|
136    $stderr.print("-> rm ", path.inspect, "\n") if verbose
137    begin
138      File.delete(path) if not dry 
139      count = count + 1
140    rescue StandardError => msg
141      $stderr.print(msg, "\n") if verbose
142      $stderr.print("Cannot remove ", path.inspect, "; skipping.\n") if info
143    end
144  end
145
146  $stdout.print("Removed ", count, " files.") if info
147  $stdout.print(" (Not really.)") if info and dry
148  $stdout.print("\n") if info
149
150  return 0
151end
152
153run()

  1. Previous versions had no options to add or delete from the list, but it was part of the Options struct regardless. ↩︎