duplicate-files.py - 2024-04-07

Frank Mitchell

Posted: 2024-04-07
Word Count: 574
Tags: programming python python-code

Table of Contents

For an explanation, see the directory above.

duplicate-files.py

Summary of Changes

Usage

usage: duplicate-files.py [-h] [-q, --quiet] [-v, --verbose]
                          [-o, --output FILE] [-j, --json] [-y, --yaml]
                          [-p, --pretty] [-z, --zero-length]
                          DIR [DIR ...]

Find duplicate files in one or more directories.

positional arguments:
  DIR                a directory to scan

options:
  -h, --help         show this help message and exit
  -q, --quiet        run without printing to standard output.
  -v, --verbose      print extra information to standard output.
  -o, --output FILE  write to the given file (default: standard output)
  -j, --json         write output as JSON
  -y, --yaml         write output as YAML (default)
  -p, --pretty       "pretty-print" the output
  -z, --zero-length  include zero-length files

Listing

#!/usr/bin/env python3

from pathlib import Path
from itertools import combinations

import argparse
import filecmp
import fnmatch
import json

# PyYAML is an optional third-party package.  Record whether it could be
# imported so the rest of the script can check the flag before using `yaml`.
YAML_ENABLED: bool
try:
    import yaml
except ImportError:
    YAML_ENABLED = False
else:
    YAML_ENABLED = True


# Shell-style name patterns for files and directories the scan skips.
STD_EXCLUDES: list[str] = [
    # version control directories and metadata
    'CVS',
    'CVSROOT',
    '.git',
    '.gitignore',
    '.svn',
    # macOS-specific hidden files
    '.DS_Store',
    '.Apple*',
    '._*',
    # trash directories
    'Trash',
    '.Trash*',
]

def prunable(name: str) -> bool:
    """Return True when *name* matches any pattern in STD_EXCLUDES.

    Patterns are shell-style globs tested with ``fnmatch.fnmatch``.
    """
    for pattern in STD_EXCLUDES:
        if fnmatch.fnmatch(name, pattern):
            return True
    return False

def find_files(sizemap: dict[int, set[Path]], name: str | Path) -> None:
    """Record every regular file at or below *name* in *sizemap*.

    *sizemap* is updated in place: each file found is added to the set
    stored under its size (``st_size``).  Directories are walked
    recursively.  Entries whose name is prunable() and symbolic links are
    skipped, and anything that is neither a directory nor a regular file
    is ignored.
    """
    entry: Path = name if isinstance(name, Path) else Path(name)

    # Excluded names and symlinks are never followed or recorded.
    if prunable(entry.name) or entry.is_symlink():
        return

    if entry.is_dir():
        for child in entry.iterdir():
            find_files(sizemap, child)
        return

    if entry.is_file():
        size: int = entry.stat().st_size
        sizemap.setdefault(size, set()).add(entry)

def duplicate_files(file_i, file_j) -> bool:
    """Return True when the two paths are distinct files with equal contents.

    Equal paths, and different paths that refer to the same underlying
    file (per ``samefile``), are not counted as duplicates.  Contents are
    compared byte-for-byte (``shallow=False``).
    """
    if file_i == file_j:
        return False
    if file_i.samefile(file_j):
        return False
    return filecmp.cmp(file_i, file_j, shallow=False)

def add_to_dupsets(dupsets: dict[Path, frozenset[Path]], file_i: Path, file_j: Path) -> None:
    """Merge the duplicate groups of *file_i* and *file_j* in *dupsets*.

    *dupsets* maps each file to the frozenset of files in its group.  A
    file not yet present starts in a group containing only itself.  After
    the call every member of the merged group maps to the same frozenset.
    """
    group_i = dupsets.setdefault(file_i, frozenset((file_i,)))
    group_j = dupsets.setdefault(file_j, frozenset((file_j,)))
    merged = group_i | group_j
    # Re-point every member so the whole group shares one set object.
    for member in merged:
        dupsets[member] = merged

def compare_files(sizemap: dict[int, set[Path]]) -> list[list[str]]:
    """Group files of equal size that have identical contents.

    *sizemap* maps a file size to the set of files of that size.  Every
    pair of same-size files is tested with duplicate_files() and matching
    files are merged into groups with add_to_dupsets().  Zero-length files
    (key ``0``) are skipped here.

    Returns a sorted list of groups; each group is a sorted list of path
    strings.
    """
    dupsets: dict[Path, frozenset[Path]] = {}
    for sz, candidates in sizemap.items():
        if sz == 0:
            continue
        for file_i, file_j in combinations(candidates, 2):
            # Both files already share a group, so merging them again
            # would change nothing: skip the byte-for-byte comparison.
            if file_j in dupsets.get(file_i, ()):
                continue
            if duplicate_files(file_i, file_j):
                add_to_dupsets(dupsets, file_i, file_j)

    # Members of a group all map to an equal frozenset, so a set of the
    # values yields each group exactly once.
    groups: set[frozenset[Path]] = set(dupsets.values())
    return sorted(sorted(str(p) for p in group) for group in groups)

def make_argparser() -> argparse.ArgumentParser:
    """Build the command-line parser for the duplicate-file finder.

    Each option is registered with a separate short and long flag (for
    example ``-q`` and ``--quiet``), so both spellings are accepted.

    The parsed namespace has the attributes ``dirs``, ``quiet``,
    ``verbose``, ``outfile``, ``format``, ``pretty`` and ``zero``.
    ``format`` is ``'yaml'`` by default when YAML_ENABLED is true and
    ``'json'`` otherwise; ``--yaml`` is only offered when YAML_ENABLED is
    true.
    """
    parser = argparse.ArgumentParser(
        description='Find duplicate files in one or more directories.')
    parser.add_argument('dirs',
                        metavar='DIR',
                        nargs='+',
                        help='a directory to scan')
    # Disabled option, kept for reference:
    #parser.add_argument('-d', '--from',
    #                    metavar='MAINDIR',
    #                    dest='canondir',
    #                    help='Compare other directories to this directory.')
    parser.add_argument('-q', '--quiet',
                        action='store_true',
                        dest='quiet',
                        help='run without printing to standard output.')
    parser.add_argument('-v', '--verbose',
                        action='store_true',
                        dest='verbose',
                        help='print extra information to standard output.')
    parser.add_argument('-o', '--output',
                        metavar='FILE',
                        dest='outfile',
                        help='write to the given file (default: standard output)')

    # --json is always accepted so that command lines using it keep working
    # when the yaml module is unavailable; JSON is then also the default.
    default_format = 'yaml' if YAML_ENABLED else 'json'
    parser.add_argument('-j', '--json',
                        action='store_const',
                        const='json',
                        default=default_format,
                        dest='format',
                        help='write output as JSON')
    if YAML_ENABLED:
        parser.add_argument('-y', '--yaml',
                            action='store_const',
                            const='yaml',
                            default=default_format,
                            dest='format',
                            help='write output as YAML (default)')

    parser.add_argument('-p', '--pretty',
                        action='store_true',
                        dest='pretty',
                        help='"pretty-print" the output')
    parser.add_argument('-z', '--zero-length',
                        action='store_true',
                        dest='zero',
                        help='include zero-length files')
    return parser

def main() -> None:
    """Scan the directories named on the command line and report duplicates.

    Files are collected by size with find_files(), grouped by content with
    compare_files(), and the groups are serialized as JSON or YAML.  The
    text is printed to standard output unless an output file was given, in
    which case it is written there as UTF-8.

    NOTE(review): the parsed ``quiet`` and ``verbose`` flags are not read
    anywhere in this function.
    """
    sizemap: dict[int, set[Path]] = {}
    parser: argparse.ArgumentParser = make_argparser()
    args: argparse.Namespace = parser.parse_args()

    for dirname in args.dirs:
        find_files(sizemap, dirname)

    result: list[list[str]] = compare_files(sizemap)

    if args.zero and 0 in sizemap:
        # Sorted so the report is the same from run to run; iteration order
        # of the underlying set is not stable across processes.
        zero: list[str] = sorted(str(p) for p in sizemap[0])
        # The empty entry stays last, after the real paths.
        zero.append('')  # so remove-files will remove *all* files
        result.append(zero)

    # JSON is used when requested, and always when yaml is unavailable.
    if not YAML_ENABLED or args.format == 'json':
        indent = 2 if args.pretty else None
        out = json.dumps(result, sort_keys=True, indent=indent)
    else:
        out = yaml.dump(result,
                        width=4096,
                        explicit_start=True,
                        explicit_end=args.pretty,
                        canonical=args.pretty)

    if not args.outfile:
        print(out)
    else:
        outpath = Path(args.outfile)
        outpath.write_text(out, encoding='utf-8')

# Run the command-line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()