#!/usr/bin/perl
#
# deduplicate
#
# AUTHOR:
#   Dan Harkless
#
# COPYRIGHT:
#   This file is Copyright (C) 2019 by Dan Harkless, and is released under the
#   GNU General Public License.
#
# USAGE:
#   % deduplicate [{-d|-h|-m|-s}] [-y] <master> [<dup_check>...]
#
# DESCRIPTION:
#   Traverses one or more file hierarchies, identifying duplicate regular files
#   (solely) by SHA-3 hash.  The first argument is the master dir/file; any
#   further arguments are dirs/files to check for duplicates of files found in
#   the master hierarchy.  Optionally, duplicate files can be deleted, or
#   hard-linked or symlinked to the master copy (the first copy encountered in
#   the tree).  Symlinks are ignored in this process, but currently, files
#   that are already hard-linked to each other will not be ignored, and will
#   thus be reported as duplicates.
#
# COMMANDLINE OPTIONS:
#   -d
#     Prompt user whether to delete duplicate files.  (Affirmative answers are
#     those starting with 'y' or 'Y'.)
#
#   -h
#     Prompt user whether to hard-link duplicate files to the master copy.
#
#   -m
#     Like -d, except that the user is prompted to delete the master copy
#     instead of the duplicate copy/copies.
#
#   -s
#     Prompt user whether to symlink duplicate files to the master copy.  Note
#     that the symlinks will point to absolute paths, not relative ones.
#
#   -y
#     Assume "yes" rather than prompting the user whether to do the deleting or
#     linking.
#
# TODO:
#   Look into a platform-agnostic way to identify files already hard-linked to
#   each other.
#
# DATE       MODIFICATION
# ========== ==================================================================
# 2019-01-27 Original.
## Modules used ################################################################

use strict;
use warnings;                    # get warnings for this script but not modules

use Cwd qw(abs_path);
use Digest::SHA3;
use English qw(-no_match_vars);  # allow use of names like @ARG rather than @_
use File::Basename;              # for basename() and dirname()
use File::Find;                  # for find()
use Getopt::Std;                 # for getopts()


## Globals #####################################################################

# Option flags.  getopts() assigns to the package variables $opt_X, so these
# must be declared with "our" (not "my") to be visible under "use strict".
our ($opt_d, $opt_h, $opt_m, $opt_s, $opt_y) = (0, 0, 0, 0, 0);

# Hex digest => reference to a list of paths having that digest.  Element 0 is
# the master copy; any further elements are duplicates of it.
my %hashes;

# True only while the master hierarchy is being traversed.
my $hashing_master = 0;

# One digest object is reused for every file; see compute_hashes().
my $sha3 = Digest::SHA3->new(512);


## Subroutines #################################################################

# File::Find "preprocess" callback: returns the directory entries it is given
# in sorted order, so that traversal order (and hence which copy becomes the
# master) is deterministic.
sub alphasort_dir {
    return sort(@ARG);
}


# File::Find "wanted" callback.  Hashes the current entry if it is a regular
# file that is not a symlink, and records its path in %hashes when either
#   - no file with this hash has been seen yet and the master hierarchy is
#     being traversed (the file becomes the master copy), or
#   - the hash is already known and this exact path has not been recorded for
#     it yet (the file is a duplicate).
# Files seen outside the master hierarchy whose hash is unknown are ignored.
sub compute_hashes {
    my $path = $File::Find::name;

    return unless (-f $path and not -l $path);

    $sha3->addfile($path);
    # Reusing $sha3 for the next file relies on hexdigest() leaving the object
    # ready for a fresh computation.
    my $hash = $sha3->hexdigest();

    # Use exists() so that merely looking up a hash never creates an entry.
    if (exists($hashes{$hash})) {
        # The same path can be visited more than once when the hierarchies
        # given on the commandline overlap; listing it twice would later make
        # the second unlink() of it fail.
        return if grep { $_ eq $path } @{$hashes{$hash}};
    }
    elsif (not $hashing_master) {
        return;
    }

    print "\n$path:\n$hash\n";
    push(@{$hashes{$hash}}, $path);
    return;
}


# Prints the given message to STDERR, followed by ". Aborting.", and exits
# with status 1.
sub my_die {
    print STDERR "@ARG. Aborting.\n";
    exit 1;
}


# Checks that the commandline argument $arg exists, announces it as a
# "<label> dir" or "<label> file", and returns its absolute path.  Aborts via
# my_die() if it does not exist or cannot be resolved.
#   $arg   - path as given on the commandline
#   $label - "Master" or "Duplicate-check"
sub resolve_root {
    my ($arg, $label) = @ARG;

    # Test existence before calling abs_path(), which may return undef for a
    # path it cannot resolve, so that the error message always names the path.
    -e $arg or my_die("$arg: Nonexistent " . lc($label) . " dir/file");

    my $path = abs_path($arg);
    defined($path) or my_die("$arg: Cannot determine absolute path");

    my $kind = (-d $path) ? "dir" : "file";
    print "$label $kind: $path\n";
    return $path;
}


## Main ########################################################################

my %find_options = (no_chdir   => 1,
                    preprocess => \&alphasort_dir,
                    wanted     => \&compute_hashes);
my $progname = basename($PROGRAM_NAME);

if (not getopts("dhmsy")
    or ($opt_d + $opt_h + $opt_m + $opt_s > 1)
    or scalar(@ARGV) < 1) {
    print STDERR
        "Usage: $progname [{-d|-h|-m|-s}] [-y] <master> [<dup_check>...]\n";
    exit(1);
}

# The first argument is the master hierarchy; the first copy of each distinct
# file found in it becomes the master copy for that hash.
my $master_root = resolve_root(shift(@ARGV), "Master");
$hashing_master = 1;
find(\%find_options, $master_root);
$hashing_master = 0;

# Any remaining arguments are only searched for copies of files already seen.
foreach my $arg (@ARGV) {
    print "\n";
    find(\%find_options, resolve_root($arg, "Duplicate-check"));
}

# Wording for the action selected on the commandline: progress message used
# with -y, and prompt used without it.  The action options are mutually
# exclusive, and these are only printed when one of them is set.
my ($doing, $ask) = ($opt_d or $opt_m) ? ("deleting", "delete")
                  : $opt_h ? ("hardlinking to master", "hardlink to master")
                  :          ("symlinking to master",  "symlink to master");

print "\nDuplicates:\n";
foreach my $hash (sort(keys(%hashes))) {
    my @copies = @{$hashes{$hash}};
    next if scalar(@copies) < 2;

    print "\n$hash:\n";
    my $master = $copies[0];

    foreach my $i (0 .. $#copies) {
        my $file      = $copies[$i];
        my $is_master = ($i == 0);
        print "$file: ", ($is_master ? "Master" : "Duplicate");

        # -m acts on the master copy only; -d, -h and -s act on duplicates.
        my $act = $is_master ? $opt_m : ($opt_d || $opt_h || $opt_s);
        if (not $act) {
            print ".\n";
            next;
        }

        print "; ";
        my $answer;
        if ($opt_y) {
            print "$doing.\n";
            $answer = "y";
        }
        else {
            print "$ask? ";
            $answer = <STDIN>;
            # End of input is treated as a negative answer.
            $answer = "" if not defined($answer);
        }

        next unless $answer =~ /^y/i;

        # For -h and -s the file is removed first and then recreated as a
        # link to the master copy.
        unlink($file) or my_die("$file: $OS_ERROR");
        if ($opt_h) {
            link($master, $file) or my_die("$file: $OS_ERROR");
        }
        elsif ($opt_s) {
            symlink($master, $file) or my_die("$file: $OS_ERROR");
        }
    }
}