For an explanation, see the directory above.
duplicate-files.py
Summary of Changes
- Added type annotations.
- Reworked code for type safety.
- Ran code through `mypy` and `pylint`, and fixed most problems. `pylint` objects to the hyphen in the name. `pylint` wants docstrings for all functions.
- Removed attempt to import `simplejson`.
Usage
usage: duplicate-files.py [-h] [-q, --quiet] [-v, --verbose]
[-o, --output FILE] [-j, --json] [-y, --yaml]
[-p, --pretty] [-z, --zero-length]
DIR [DIR ...]
Find duplicate files in one or more directories.
positional arguments:
DIR a directory to scan
options:
-h, --help show this help message and exit
-q, --quiet run without printing to standard output.
-v, --verbose print extra information to standard output.
-o, --output FILE write to the given file (default: standard output)
-j, --json write output as JSON
-y, --yaml write output as YAML (default)
-p, --pretty "pretty-print" the output
-z, --zero-length include zero-length files
Listing
#!/usr/bin/env python3
from pathlib import Path
from itertools import combinations
import argparse
import filecmp
import fnmatch
import json
# PyYAML is optional.  YAML_ENABLED records whether the import succeeded so the
# rest of the script can fall back to JSON output when it did not.
YAML_ENABLED: bool
try:
    import yaml
except ImportError:
    YAML_ENABLED = False
else:
    YAML_ENABLED = True
# Shell-style (fnmatch) name patterns; prunable() tests names against these.
STD_EXCLUDES: list[str] = [
    # version-control metadata
    'CVS',
    'CVSROOT',
    '.git',
    '.gitignore',
    '.svn',
    # macOS-specific hidden files
    '.DS_Store',
    '.Apple*',
    '._*',
    # trash directories
    'Trash',
    '.Trash*',
]
def prunable(name: str) -> bool:
    """Return True if *name* matches any pattern in ``STD_EXCLUDES``.

    Patterns are matched with ``fnmatch.fnmatch``, i.e. shell-style
    wildcards.
    """
    for pattern in STD_EXCLUDES:
        if fnmatch.fnmatch(name, pattern):
            return True
    return False
def find_files(sizemap: dict[int, set[Path]], name: str | Path) -> None:
    """Recursively record the regular files under *name* in *sizemap*.

    *sizemap* is updated in place: every regular file found is added to the
    set stored under its size in bytes.  Entries for which ``prunable()``
    returns True, and symbolic links, are neither followed nor recorded.
    Anything that is neither a directory nor a regular file is ignored.
    """
    path: Path = name if isinstance(name, Path) else Path(name)

    # Guard clause: excluded names and symlinks are dropped outright.
    if prunable(path.name) or path.is_symlink():
        return

    if path.is_dir():
        for entry in path.iterdir():
            find_files(sizemap, entry)
        return

    if path.is_file():
        size: int = path.stat().st_size
        sizemap.setdefault(size, set()).add(path)
def duplicate_files(file_i: Path, file_j: Path) -> bool:
    """Return True if the two paths are distinct files with identical content.

    Args:
        file_i: First file to compare.
        file_j: Second file to compare.

    Returns:
        False when the paths compare equal or refer to the same file
        according to ``Path.samefile``; otherwise the result of
        ``filecmp.cmp`` with ``shallow=False``.
    """
    if file_i == file_j or file_i.samefile(file_j):
        # One underlying file reached twice is not a duplicate of itself.
        return False
    # shallow=False: do not accept matching os.stat() signatures as proof of
    # equality; compare the contents.
    return filecmp.cmp(file_i, file_j, shallow=False)
def add_to_dupsets(
    dupsets: dict[Path, frozenset[Path]],
    file_i: Path,
    file_j: Path,
) -> None:
    """Record that *file_i* and *file_j* belong to the same duplicate group.

    Args:
        dupsets: Maps each known file to the frozenset of all files in its
            group (the file itself included).  Updated in place.
        file_i: First file of the pair.
        file_j: Second file of the pair.

    After the call every member of either file's previous group maps to the
    union of the two groups.
    """
    # A file seen for the first time starts out in a group of its own.
    group_i = dupsets.setdefault(file_i, frozenset((file_i,)))
    group_j = dupsets.setdefault(file_j, frozenset((file_j,)))
    if group_i is group_j:
        # Both files already share one group object; nothing to merge.
        return
    merged = group_i | group_j
    for member in merged:
        dupsets[member] = merged
def compare_files(sizemap: dict[int, set[Path]]) -> list[list[str]]:
    """Group files that have identical content.

    Args:
        sizemap: Maps a file size in bytes to the set of files of that size.
            Only files sharing a size are compared with each other.

    Returns:
        A sorted list of duplicate groups.  Each group is a sorted list of
        path strings for files that ``duplicate_files()`` found equal,
        directly or through a chain of equal pairs.  Files under size 0 are
        skipped and never appear in the result.
    """
    dupsets: dict[Path, frozenset[Path]] = {}
    for size, same_size in sizemap.items():
        if size == 0:
            continue
        for file_i, file_j in combinations(same_size, 2):
            if file_j in dupsets.get(file_i, frozenset()):
                # Already in one group via earlier pairs: comparing the
                # contents again could not change the grouping.
                continue
            if duplicate_files(file_i, file_j):
                add_to_dupsets(dupsets, file_i, file_j)
    # Each group is stored once per member in dupsets; the set removes the
    # repeats.
    groups: set[frozenset[Path]] = set(dupsets.values())
    return sorted(sorted(str(p) for p in group) for group in groups)
def make_argparser() -> argparse.ArgumentParser:
    """Build the command-line parser for the duplicate-file finder.

    Each option is registered with its short and long flag as separate
    option strings, so both ``-q`` and ``--quiet`` are accepted.

    Returns:
        A parser whose namespace has ``dirs`` (list of str), ``quiet``,
        ``verbose``, ``pretty`` and ``zero`` (bool), ``outfile`` (str or
        None) and ``format`` (``'json'`` or ``'yaml'``).  ``format``
        defaults to ``'yaml'`` when PyYAML is available, else ``'json'``.
    """
    parser = argparse.ArgumentParser(
        description='Find duplicate files in one or more directories.')
    parser.add_argument('dirs',
                        metavar='DIR',
                        nargs='+',
                        help='a directory to scan')
    # TODO: a '-d/--from MAINDIR' option (compare other directories to this
    # directory) was sketched here but is not implemented.
    parser.add_argument('-q', '--quiet',
                        action='store_true',
                        dest='quiet',
                        help='run without printing to standard output.')
    parser.add_argument('-v', '--verbose',
                        action='store_true',
                        dest='verbose',
                        help='print extra information to standard output.')
    parser.add_argument('-o', '--output',
                        metavar='FILE',
                        dest='outfile',
                        help='write to the given file (default: standard output)')
    # JSON is always available, so -j is always accepted; -y is offered only
    # when PyYAML could be imported.
    default_format = 'yaml' if YAML_ENABLED else 'json'
    parser.add_argument('-j', '--json',
                        action='store_const',
                        const='json',
                        default=default_format,
                        dest='format',
                        help='write output as JSON')
    if YAML_ENABLED:
        parser.add_argument('-y', '--yaml',
                            action='store_const',
                            const='yaml',
                            default=default_format,
                            dest='format',
                            help='write output as YAML (default)')
    parser.add_argument('-p', '--pretty',
                        action='store_true',
                        dest='pretty',
                        help='"pretty-print" the output')
    parser.add_argument('-z', '--zero-length',
                        action='store_true',
                        dest='zero',
                        help='include zero-length files')
    return parser
def main() -> None:
    """Scan the directories named on the command line and report duplicates.

    The result is a list of groups of path strings.  It is serialized as
    YAML, or as JSON when ``-j`` was given or PyYAML is unavailable, and
    then printed to standard output or written (UTF-8) to the ``-o`` file.
    With ``-z`` the zero-length files found are appended as one extra group.

    The ``-q``/``-v`` flags are parsed but not consulted in this function.
    """
    parser: argparse.ArgumentParser = make_argparser()
    args: argparse.Namespace = parser.parse_args()

    sizemap: dict[int, set[Path]] = {}
    for dirname in args.dirs:
        find_files(sizemap, dirname)

    result: list[list[str]] = compare_files(sizemap)
    if args.zero and 0 in sizemap:
        # Sorted like every other group: iteration order of the underlying
        # set is not reproducible from run to run.
        zero: list[str] = sorted(str(p) for p in sizemap[0])
        zero.append('')  # so remove-files will remove *all* files
        result.append(zero)

    out: str
    if not YAML_ENABLED or args.format == 'json':
        out = json.dumps(result,
                         sort_keys=True,
                         indent=2 if args.pretty else None)
    else:
        out = yaml.dump(result,
                        width=4096,
                        explicit_start=True,
                        explicit_end=args.pretty,
                        canonical=args.pretty)

    if args.outfile:
        Path(args.outfile).write_text(out, encoding='utf-8')
    else:
        print(out)
# Script entry point: run the scan only when executed directly, not on import.
if __name__ == "__main__":
    main()