For an explanation, see the directory above.
duplicate-files.rb
Usage
Usage: duplicate-files.rb [options] dir1 [dir2 ...]
-q, --[no-]quiet Run absolutely quietly
-v, --[no-]verbose Run verbosely
-d, --from [DIR] Compare arguments only to files from DIR
-o, --output [OUTFILE] Write standard output to OUTFILE
-j, --json Write output as JSON
-y, --yaml Write output as YAML (default)
-p, --[no-]pretty Pretty-print output
Listing
1#!/usr/bin/env ruby
2
3require 'find'
4require 'fileutils'
5require 'optparse'
6require 'yaml'
7require 'json'
8
9
# Base names that the directory walk skips by default.
PRUNE = %w[.svn CVS CVSROOT .DS_Store .git]

# UTF-8 encoding object, or nil when the interpreter has no Encoding
# constant; callers test this value before transcoding paths.
# NOTE(review): the original comment attributes this check to the
# Ruby 1.9 -> 2.0 transition -- confirm which versions really need it.
ENCODING_UTF8 =
  self.class.const_defined?(:Encoding) ? Encoding.find('UTF-8') : nil
19
# Tiny text spinner: draws one glyph on an IO stream and replaces it in
# place (via a backspace) as work progresses.
class Spinner
  # Glyphs shown in rotation.
  SPINNER_STATES = ['-', '\\', '|', '/']

  # Minimum number of seconds between two redraws.
  SPINNER_INTERVAL = 0.5

  # io: stream that receives the spinner characters.
  def initialize(io=$stderr)
    @io = io
    @state = 0
    @updated = nil
  end

  # Record the start time and draw the first glyph.
  def start
    @updated = Time.now
    @io.print(SPINNER_STATES[0])
  end

  # Advance to the next glyph when more than SPINNER_INTERVAL seconds
  # have passed since the last redraw; starts the spinner on first use.
  def update
    start unless @updated

    now = Time.now
    return unless now - @updated > SPINNER_INTERVAL

    @updated = now
    @state = (@state + 1) % SPINNER_STATES.length
    @io.print("\b", SPINNER_STATES[@state])
  end
end
50
# Stand-in for an Array that gathers duplicate-file sets through `<<`
# and, on request, writes the sorted collection to `io` using Ruby's
# default string conversion followed by a newline.
class DefaultPrinter
  # io:     stream the results are written to.
  # pretty: pretty-printing flag, exposed through `pretty?`.
  attr_accessor :io, :pretty

  def initialize
    @results = []
  end

  # Current value of the pretty-printing flag.
  def pretty?
    @pretty
  end

  # Append one result set to the collection.
  def <<(obj)
    @results << obj
  end

  # Sort the collected sets in place and hand them to `print`.
  def print_results
    print(@results.sort!)
  end

  protected

  # Write `obj` and a trailing newline to the configured stream.
  def print(obj)
    @io.write(obj)
    @io.write("\n")
  end
end
80
# DefaultPrinter variant whose output is a JSON document.
class JsonPrinter < DefaultPrinter
  # Serialize `obj` as JSON (indented when the pretty flag is set) and
  # write it to the stream, followed by a newline.
  def print(obj)
    text = @pretty ? JSON.pretty_generate(obj) : JSON.generate(obj)
    @io.write(text)
    @io.write("\n")
  end
end
93
# DefaultPrinter variant whose output is a YAML document.
class YamlPrinter < DefaultPrinter

  # Wide :line_width so file names containing spaces are not folded
  # across lines.
  DEFAULT_LINE_WIDTH = 4096

  # Dump `obj` as YAML to the configured stream.
  #
  # The argument is what gets serialized (previously the method ignored
  # it and always dumped @results), which matches the other printers.
  def print(obj)
    opts = {:line_width => DEFAULT_LINE_WIDTH}
    if @pretty
      # {:canonical => true} looks almost identical to pretty-printed
      # JSON, so the header keeps the two formats distinguishable.
      opts[:header] = true
      opts[:canonical] = true
    end
    YAML.dump(obj, @io, opts)
  end
end
111
112
# True when `path_i` and `path_j` both exist, are not the very same
# file, and have equal contents.
def duplicate_files?(path_i, path_j)
  return false unless File.exist?(path_i) && File.exist?(path_j)
  # Two names for one file are not reported as duplicates.
  return false if File.identical?(path_i, path_j)

  FileUtils.cmp(path_i, path_j)
end
120
# True when the base name of `path` is listed in `prune` or starts
# with "._".
def prunable?(path, prune=[])
  base = File.basename(path)
  prune.include?(base) || File.fnmatch('._*', base)
end
125
# Walk every directory in `dirs` and group the non-empty regular files
# found there by size.
#
# dirs:    array of directory paths to search.
# prune:   base names to skip (see `prunable?`).
# verbose: when true, progress and a summary are printed to $stderr.
#
# Returns a Hash mapping each file size to an array of paths.
def files_by_size(dirs, prune=[], verbose=false)
  result = Hash.new {|h,k| h[k]=[]}
  count = 0
  spinner = nil
  if verbose
    $stderr.print("Looking for files in ", dirs, ": ")
    spinner = Spinner.new($stderr)
    spinner.start
  end
  Find.find(*dirs) do |path|
    if prunable?(path, prune)
      Find.prune
    elsif File.file?(path)
      size = File.size(path)
      if size > 0
        # Count each file exactly once; the counter used to be bumped a
        # second time in verbose mode, doubling the reported total.
        count += 1
        path.encode!(ENCODING_UTF8) if ENCODING_UTF8
        result[size] << path
        spinner.update if verbose
      end
    end
  end
  if verbose
    $stderr.print(" done!\n")

    $stderr.print("Found ", count, " non-empty files in ",
                  result.size, " size groups.\n")
  end
  result
end
160
# Compare every pair of files in `paths` and append each distinct
# sorted list of mutually equal files to `result`. Returns `result`.
def append_duplicates(result, paths)
  # Each path lazily gets a list that initially holds only itself.
  groups = Hash.new { |hash, path| hash[path] = [path] }

  paths.combination(2) do |path_i, path_j|
    next unless duplicate_files?(path_i, path_j)

    mine, theirs = groups[path_i], groups[path_j]
    mine.push(path_j).sort!
    theirs.push(path_i).sort!
  end

  # Lists with equal contents collapse into a single entry.
  groups.values.uniq.each { |group| result << group }
  result
end
182
# For every size bucket in `fmap` (a map of paths keyed by file size)
# that holds more than one path, compare those paths and append the
# lists of equal files to `result`.
def append_all_dupes(result, fmap, verbose=false)
  fmap.each_pair do |_size, paths|
    # A single file of a given size cannot have a duplicate.
    next unless paths.size > 1

    $stderr.print("Comparing ", paths, " ...") if verbose
    append_duplicates(result, paths)
    $stderr.print("done.\n") if verbose
  end
end
194
# Walk `srcdir` and compare each non-empty regular file with the files
# of the same size recorded in `fmap`. For every file that has at least
# one duplicate, append [path_in_srcdir, *sorted_duplicates] to
# `result`, so the path from `srcdir` always comes first.
# Returns `result`.
def append_dir_dupes(result, fmap, srcdir, prune=[], verbose=false)
  $stderr.print("Looking for files in '", srcdir, "':\n") if verbose

  Find.find(srcdir) do |candidate|
    Find.prune if prunable?(candidate, prune)
    next unless File.file?(candidate)

    size = File.size(candidate)
    peers = fmap[size]
    next unless size > 0 && peers && !peers.empty?

    $stderr.print("Comparing '", candidate,
                  "' to ", peers, " ...") if verbose
    matches = peers.select { |other| duplicate_files?(candidate, other) }
    result << [candidate, *matches.sort] unless matches.empty?
    $stderr.print(" done.\n") if verbose
  end

  $stderr.print("Done!\n") if verbose
  result
end
226
# Parse the command line (consuming recognized switches from ARGV) and
# store the outcome in `config`.
#
# config: object with writable `info`, `verbose`, `canondir`, `printer`
#         and `dirs` attributes.
#
# After parsing, `config.printer` is a new YamlPrinter (default) or
# JsonPrinter writing to the -o file or to $stdout, and `config.dirs`
# is whatever is left in ARGV.
def parse_options(config)
  outpath = nil
  pclass = YamlPrinter
  pretty = false

  OptionParser.new do |opts|
    opts.banner = "Usage: duplicate-files.rb [options] dir1 [dir2 ...]"

    opts.on("-q", "--[no-]quiet", "Run absolutely quietly") do |v|
      config.info = !v
    end
    opts.on("-v", "--[no-]verbose", "Run verbosely") do |v|
      config.verbose = v
    end
    opts.on("-d", "--from [DIR]",
            "Compare arguments only to files from DIR") do |d|
      config.canondir = d
    end
    opts.on("-o", "--output [OUTFILE]",
            "Write standard output to OUTFILE") do |f|
      # The argument is optional, so `f` is nil for a bare -o. Only the
      # path is remembered here; the file is opened once, after parsing.
      outpath = f
    end
    opts.on("-j", "--json", "Write output as JSON") do |_v|
      pclass = JsonPrinter
    end
    opts.on("-y", "--yaml", "Write output as YAML (default)") do |_v|
      pclass = YamlPrinter
    end
    opts.on("-p", "--[no-]pretty", "Pretty-print output") do |v|
      pretty = v
    end
  end.parse!

  # Opening after a successful parse avoids File.new(nil) on a bare -o,
  # avoids leaking a handle when -o is repeated, and does not truncate
  # the output file when the command line turns out to be invalid.
  outfile = outpath ? File.new(outpath, 'w') : nil

  printer = pclass.new
  printer.io = outfile || $stdout
  printer.pretty = pretty

  config.printer = printer
  config.dirs = ARGV
end
269
# Settings record for one run of the script; the class is registered as
# Struct::Options and its members are filled in positional order.
Options = Struct.new(
  "Options",
  *%i[info verbose canondir dirs prune printer]
)
272
# Script entry point: parse the command line, index the files in the
# requested directories by size, collect duplicates through the
# configured printer, and print the results.
def run
  opts = Options.new(true, false, nil, [], PRUNE, nil)
  parse_options(opts)

  # The search reports progress in both info and verbose mode.
  show_progress = opts.info || opts.verbose
  fmap = files_by_size(opts.dirs, opts.prune, show_progress)

  $stderr.print("Comparing files ...") if opts.info

  if opts.canondir
    append_dir_dupes(opts.printer, fmap, opts.canondir,
                     opts.prune, opts.verbose)
  else
    append_all_dupes(opts.printer, fmap, opts.verbose)
  end

  $stderr.print(" done!\n") if opts.info

  opts.printer.print_results
end
297
298run()
remove-files.rb
Usage
Usage: remove-files.rb [options] [file1 [file2 ...]]
-q, --[no-]quiet Run absolutely quietly
-v, --[no-]verbose Run verbosely
-j, --json [INPUT] Read paths from JSON file
-y, --yaml [INPUT] Read paths from YAML file
Listing
1#!/usr/bin/env ruby
2
3require 'optparse'
4require 'yaml'
5require 'json'
6
7
# Append to `result` every path of every set in `filesets` except the
# first path of each set, which is the copy that is kept.
#
# result:   collection receiving the paths via `<<`.
# filesets: array of arrays of paths.
# verbose:  when true, the skipped and added paths are logged to $stderr.
def append_files(result, filesets, verbose=true)
  filesets.each do |fset|
    # drop(1) yields [] for an empty set, whereas fset[1..-1] yields nil
    # there and made the loop below raise NoMethodError.
    delset = fset.drop(1)
    if verbose
      $stderr.print("Skipping ", fset[0].inspect, ";\n")
      $stderr.print("Adding ", delset, "\n")
    end
    delset.each do |n|
      result << n
    end
  end
end
21
# Script entry point: gather paths from the -j/-y input files and from
# the remaining command-line arguments, then delete each one.
#
# Files that cannot be deleted are reported (unless quiet) and skipped;
# a summary count is printed to $stderr at the end (unless quiet).
def run
  info = true
  verbose = false
  files = []
  jsonconf = []
  yamlconf = []

  OptionParser.new do |opts|
    opts.banner = "Usage: remove-files.rb [options] [file1 [file2 ...]]"

    opts.on("-q", "--[no-]quiet", "Run absolutely quietly") do |v|
      info = !v
    end
    opts.on("-v", "--[no-]verbose", "Run verbosely") do |v|
      verbose = v
    end
    # INPUT is optional, so the block receives nil for a bare -j / -y.
    # Such a switch names no file and is ignored instead of crashing
    # later when the nil is used as a path.
    opts.on("-j", "--json [INPUT]", "Read paths from JSON file") do |v|
      jsonconf << v if v
    end
    opts.on("-y", "--yaml [INPUT]", "Read paths from YAML file") do |v|
      yamlconf << v if v
    end
  end.parse!

  jsonconf.each do |path|
    $stderr.print("Reading ", path, " ...\n") if verbose
    append_files(files, JSON.parse(File.read(path)), verbose)
  end

  yamlconf.each do |path|
    $stderr.print("Reading ", path, " ...\n") if verbose
    # NOTE(review): the input file is user supplied; if it can come from
    # an untrusted source, confirm that YAML.load_file is restricted to
    # plain data on the Ruby versions this script targets.
    append_files(files, YAML.load_file(path), verbose)
  end

  # Whatever is left on the command line is a path to delete as-is.
  files.concat(ARGV)

  if verbose
    $stderr.print("Files to remove:\n")
    files.each do |path|
      $stderr.print("- ", path.inspect, "\n")
    end
  end

  count = 0

  files.each do |path|
    $stderr.print("-> rm ", path.inspect, "\n") if verbose
    begin
      File.delete(path)
      count += 1
    rescue StandardError => err
      # Best effort: report the failure and carry on with the rest.
      $stderr.print(err, "\n") if verbose
      $stderr.print("Cannot remove ", path.inspect, "; skipping.\n") if info
    end
  end

  $stderr.print("Removed ", count, " files.\n") if info
end
87
88run()