duplicate-files.rb - 2022-11-26

Posted: 2022-11-26
Word Count: 1506
Tags: programming ruby

Table of Contents

For an explanation, see the directory above.

duplicate-files.rb

Usage

Usage: duplicate-files.rb [options] dir1 [dir2 ...]
    -q, --[no-]quiet                 Run absolutely quietly
    -v, --[no-]verbose               Run verbosely
    -d, --from [DIR]                 Compare arguments only to files from DIR
    -o, --output [OUTFILE]           Write standard output to OUTFILE
    -j, --json                       Write output as JSON
    -y, --yaml                       Write output as YAML (default)
    -p, --[no-]pretty                Pretty-print output

Listing

  1#!/usr/bin/env ruby 
  2
  3require 'find'
  4require 'fileutils'
  5require 'optparse'
  6require 'yaml'
  7require 'json'
  8
  9
 10# Default list of files to prune in search
 11PRUNE = ['.svn', 'CVS', 'CVSROOT', '.DS_Store', '.git']
 12
 13# Unfortunate artifact of transition between Ruby 1.9 and 2.0
 14if self.class.const_defined?(:Encoding) then
 15    ENCODING_UTF8 = Encoding.find('UTF-8')
 16else
 17    ENCODING_UTF8 = nil
 18end
 19
 20# Simple class to manage an ASCII spinner
 21class Spinner
 22    SPINNER_STATES = ['-', '\\', '|', "/"]
 23
 24    SPINNER_INTERVAL = 0.5 # s
 25
 26    def initialize(io=$stderr)
 27        @io = io
 28        @state = 0
 29        @updated = nil
 30    end
 31
 32    def start()
 33        @updated = Time.now
 34        @io.print(SPINNER_STATES[0])
 35    end
 36
 37    def update()
 38        if not @updated then
 39            start()
 40        end
 41
 42        now = Time.now
 43        if now - @updated > SPINNER_INTERVAL then
 44            @updated = now
 45            @state = (@state + 1) % SPINNER_STATES.length
 46            @io.print("\b", SPINNER_STATES[@state])
 47        end
 48    end
 49end
 50
 51# Collects duplicate file sets in lieu of an Array, 
 52# and writes them as Ruby objects
 53class DefaultPrinter
 54    attr_accessor :io, :pretty
 55
 56    def initialize
 57        @results = []
 58    end
 59
 60    def pretty?
 61        return @pretty
 62    end
 63
 64    def << (obj)
 65        @results << obj
 66    end
 67
 68    def print_results
 69        @results.sort!
 70        print(@results)
 71    end
 72
 73    protected
 74
 75    def print(obj)
 76        @io.write(obj)
 77        @io.write("\n");
 78    end
 79end
 80
 81# Printer class (see above) that serializes as JSON
 82class JsonPrinter < DefaultPrinter
 83    def print(obj)
 84        if @pretty then
 85            text = JSON.pretty_generate(obj)
 86        else
 87            text = JSON.generate(obj)
 88        end
 89        @io.write(text)
 90        @io.write("\n")
 91    end
 92end
 93
 94# Printer class (see above) that serializes as YAML.
 95class YamlPrinter < DefaultPrinter
 96
 97    # :line_width setting so file names w/spaces don't break across lines
 98    DEFAULT_LINE_WIDTH = 4096
 99
100    def print(obj)
101        opts = {:line_width => DEFAULT_LINE_WIDTH} 
102        if @pretty then
103            # {:canonical => true} looks almost identical to pretty-printed JSON
104            # so the header keeps them distinct.
105            opts[:header] = true
106            opts[:canonical] = true
107        end
108        YAML.dump(@results, @io, opts)
109    end
110end
111
112
# True when `path_i` and `path_j` both exist, are NOT the very same file
# (per File.identical?), and have byte-for-byte equal contents.
def duplicate_files?(path_i, path_j)
    return false unless File.exist?(path_i)
    return false unless File.exist?(path_j)
    # The same underlying file is not a "duplicate" of itself.
    return false if File.identical?(path_i, path_j)

    FileUtils.cmp(path_i, path_j)
end
120
# Whether `path` should be skipped during traversal: its basename is
# either listed in `prune` or matches the glob '._*'.
def prunable?(path, prune=[])
    base = File.basename(path)
    prune.include?(base) || File.fnmatch('._*', base)
end
125
# Recurse through the array of `dirs` and produce a Hash of all regular,
# non-empty files keyed by their size.
#
# dirs    - Array of paths handed to Find.find.
# prune   - Array of basenames to skip entirely (see prunable?).
# verbose - when true, progress (with a spinner) and a summary line are
#           written to $stderr.
#
# Returns a Hash mapping size => Array of paths. The Hash has a default
# block, so looking up a size that is absent inserts an empty Array.
def files_by_size(dirs, prune=[], verbose=false)
    result = Hash.new {|h,k| h[k]=[]}
    count = 0
    spinner = nil
    if verbose then
        $stderr.print("Looking for files in ", dirs, ": ")
        spinner = Spinner.new($stderr)
        spinner.start
    end
    Find.find(*dirs) do |path|
        # Find.prune does not return: it skips this entry and its children.
        Find.prune() if prunable?(path, prune)
        next unless File.file?(path)

        size = File.size(path)
        next unless size > 0

        # Count every file exactly once, so the summary below reports the
        # real number of files regardless of verbosity.
        count += 1
        path.encode!(ENCODING_UTF8) if ENCODING_UTF8
        result[size] << path
        spinner.update if verbose
    end
    if verbose then
        $stderr.print(" done!\n")

        $stderr.print("Found ", count, " non-empty files in ",
                      result.size, " size groups.\n")
    end
    return result
end
160
# Compare every pair of files in `paths` and append each distinct list of
# equal files (sorted) to `result`. Returns `result`.
def append_duplicates(result, paths)
    # Each path that takes part in a match gets its own list, seeded
    # with the path itself.
    groups = Hash.new { |hash, key| hash[key] = [key] }

    paths.each_with_index do |first, index|
        paths.drop(index + 1).each do |second|
            next unless duplicate_files?(first, second)

            group_first = groups[first]
            group_second = groups[second]
            group_first << second
            group_first.sort!
            group_second << first
            group_second.sort!
        end
    end

    # Lists that ended up with equal contents are reported only once.
    groups.values.uniq.each { |group| result << group }
    result
end
182
# Look for duplicates inside every size group of `fmap` (a map of file
# paths by size) and append the lists of equal files to `result`.
# Groups with fewer than two paths are skipped.
def append_all_dupes(result, fmap, verbose=false)
    fmap.each_pair do |_size, candidates|
        next if candidates.size <= 1

        $stderr.print("Comparing ", candidates, " ...") if verbose
        append_duplicates(result, candidates)
        $stderr.print("done.\n") if verbose
    end
end
194
# Walk `srcdir`, compare each non-empty regular file found there with
# the files of the same size in `fmap`, and append one list per match
# to `result`: the path from `srcdir` first, then its duplicates sorted.
# Returns `result`.
def append_dir_dupes(result, fmap, srcdir, prune=[], verbose=false)
    $stderr.print("Looking for files in '", srcdir, "':\n") if verbose

    Find.find(srcdir) do |candidate|
        # Find.prune does not return: it skips this entry and its children.
        Find.prune() if prunable?(candidate, prune)
        next unless File.file?(candidate)

        size = File.size(candidate)
        same_size = fmap[size]
        next if size <= 0 || !same_size || same_size.empty?

        $stderr.print("Comparing '", candidate,
                      "' to ", same_size, " ...") if verbose
        matches = same_size.select { |other| duplicate_files?(candidate, other) }
        unless matches.empty?
            matches.sort!
            result << [candidate, *matches]
        end
        $stderr.print(" done.\n") if verbose
    end

    $stderr.print("Done!\n") if verbose
    result
end
226
# Parse the command line (ARGV, destructively) into `config`.
#
# config - object with info=, verbose=, canondir=, printer= and dirs=
#          writers (the Options struct in this script).
#
# Side effects: recognized options are removed from ARGV, and the
# remaining ARGV (the directories to scan) is stored in config.dirs.
# When -o names a file it is opened for writing and used as the
# printer's output; otherwise the printer writes to $stdout.
def parse_options(config)

    outfile = nil
    pclass = YamlPrinter
    pretty = false

    OptionParser.new do |opts|
        opts.banner = "Usage: duplicate-files.rb [options] dir1 [dir2 ...]"

        opts.on("-q", "--[no-]quiet", "Run absolutely quietly") do |v|
            config.info = !v
        end
        opts.on("-v", "--[no-]verbose", "Run verbosely") do |v|
            config.verbose = v
        end
        opts.on("-d", "--from [DIR]",
                "Compare arguments only to files from DIR") do |d|
            # The argument is optional; a bare -d yields nil, which
            # leaves the canonical-directory mode switched off.
            config.canondir = d
        end
        opts.on("-o", "--output [OUTFILE]",
                "Write standard output to OUTFILE") do |f|
            # The argument is optional; a bare -o yields nil. Opening nil
            # would raise TypeError, so keep the $stdout default instead.
            outfile = File.new(f, 'w') if f
        end
        opts.on("-j", "--json", "Write output as JSON") do |_v|
            pclass = JsonPrinter
        end
        opts.on("-y", "--yaml", "Write output as YAML (default)") do |_v|
            pclass = YamlPrinter
        end
        opts.on("-p", "--[no-]pretty", "Pretty-print output") do |v|
            pretty = v
        end
    end.parse!

    printer = pclass.new

    printer.io = (outfile || $stdout)
    printer.pretty = pretty

    config.printer = printer
    config.dirs = ARGV
end
269
# Run-time settings record filled in by parse_options and read by run.
Options = Struct.new("Options",
                     :info,      # print informational messages (off with -q)
                     :verbose,   # print detailed progress (-v)
                     :canondir,  # directory given with -d/--from, or nil
                     :dirs,      # non-option command-line arguments
                     :prune,     # basenames skipped during traversal
                     :printer)   # printer that collects and writes results
272
# Script entry point: parse the command line, index the files of the
# given directories by size, find duplicates (against the -d directory
# when one was given, otherwise among all files), and print the results.
def run
    opts = Options.new(true, false, nil, [], PRUNE, nil)
    parse_options(opts)

    # The file scan reports progress for both info and verbose runs.
    show_progress = (opts.info || opts.verbose)
    fmap = files_by_size(opts.dirs, opts.prune, show_progress)

    $stderr.print("Comparing files ...") if opts.info

    if opts.canondir
        append_dir_dupes(opts.printer, fmap, opts.canondir,
                         opts.prune, opts.verbose)
    else
        append_all_dupes(opts.printer, fmap, opts.verbose)
    end

    $stderr.print(" done!\n") if opts.info

    opts.printer.print_results
end
297
298run()

remove-files.rb

Usage

Usage: remove-files.rb [options] [file1 [file2 ...]]
    -q, --[no-]quiet                 Run absolutely quietly
    -v, --[no-]verbose               Run verbosely
    -j, --json [INPUT]               Read paths from JSON file
    -y, --yaml [INPUT]               Read paths from YAML file

Listing

 1#!/usr/bin/env ruby 
 2
 3require 'optparse'
 4require 'yaml'
 5require 'json'
 6
 7
 8def append_files(result, filesets, verbose=true)
 9    filesets.each do |fset|
10        delset = fset[1..-1]
11        # Skip the first file in each set
12        if verbose then
13            $stderr.print("Skipping ", fset[0].inspect, ";\n");
14            $stderr.print("Adding ", delset, "\n")
15        end
16        delset.each do |n|
17            result << n
18        end
19    end
20end
21
# Script entry point: collect the paths to delete from the JSON/YAML
# duplicate lists given with -j/-y (all but the first path of each set,
# see append_files) plus any paths given as plain arguments, then delete
# them one by one. A path that cannot be removed is reported and skipped.
def run

    info = true
    verbose = false
    files = []
    jsonconf = []
    yamlconf = []

    OptionParser.new do |opts|
        opts.banner = "Usage: remove-files.rb [options] [file1 [file2 ...]]"

        opts.on("-q", "--[no-]quiet", "Run absolutely quietly") do |v|
            info = !v
        end
        opts.on("-v", "--[no-]verbose", "Run verbosely") do |v|
            verbose = v
        end
        # The INPUT arguments are optional; a bare -j/-y yields nil, which
        # would crash when opened later, so such occurrences are ignored.
        opts.on("-j", "--json [INPUT]", "Read paths from JSON file") do |v|
            jsonconf << v if v
        end
        opts.on("-y", "--yaml [INPUT]", "Read paths from YAML file") do |v|
            yamlconf << v if v
        end
    end.parse!

    jsonconf.each do |path|
        $stderr.print("Reading ", path, " ...\n") if verbose
        File.open(path) do |f|
            append_files(files, JSON.parse(f.read), verbose)
        end
    end

    yamlconf.each do |path|
        $stderr.print("Reading ", path, " ...\n") if verbose
        # NOTE(review): depending on the Psych version, load_file may
        # instantiate arbitrary objects; pass only trusted files.
        append_files(files, YAML.load_file(path), verbose)
    end

    # Whatever is left in ARGV after option parsing is deleted as well.
    files.concat(ARGV)

    if verbose then
        $stderr.print("Files to remove:\n")
        files.each do |path|
            $stderr.print("- ", path.inspect, "\n");
        end
    end

    count = 0

    files.each do |path|
        $stderr.print("-> rm ", path.inspect, "\n") if verbose
        begin
           File.delete(path)
           count += 1
        rescue StandardError => msg
           $stderr.print(msg, "\n") if verbose
           $stderr.print("Cannot remove ", path.inspect, "; skipping.\n") if info
        end
    end

    $stderr.print("Removed ", count, " files.\n") if info

end
87
88run()