#!/usr/bin/perl # # Copyright (C) Pedro Larroy Tovar piotr%NOSPAMlarroy.com # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # # use File::Find; use Cwd; use Digest::MD5; use Getopt::Long; use DB_File; use Data::Dumper; use warnings; use strict; #use diagnostics; use Storable; # # @files is an array of hashes. Each hash holds: # name: the full name of the file, relative to the cwd. # dir: directory of the file relative to the cwd. # size: in sumbytes. # md5: md5sum of the file. # # # Array @eq # --- # | ( {hash} , {hash}, {hash} ) <- arrays of duplicate files # | ( {hash} , {hash} ) <- # . # . # . # | # | ( {hash} , {hash}, {hash} , {hash}, {hash} ) <- # | | # --- # # # Global variables my (@files,@eq,@del); my @progress = ("-","\\","|","/"); my ($help,$verbose,$nodup,$hardlink,$symlink,$prompt,$quiet,$converge); my ($sumbytes,$numfiles,$dupbytes,$dupfiles,$multiple, $totalfiles); my ($time_i,$time_f,$time); $dupfiles=0; $multiple=0; $sumbytes=0; $numfiles=0; $dupbytes=0; $totalfiles=0; GetOptions ( 'help' => \$help, 'verbose' => \$verbose, 'nodup' => \$nodup, 'hardlink' => \$hardlink, 'symlink' => \$symlink, 'prompt' => \$prompt, 'quiet' => \$quiet, 'converge=s' => \$converge ) or usage(); #tie @files, 'DB_File', "files.db", O_CREAT|O_RDWR, 0666 # or die "Can't open database: $!"; sub usage { die "usage: $0 [OPTION] DIRECTORIES TO SEARCH\n \t--verbose \tPrint duplicate files grouped.\n \t--quiet \tgo figure\n \t--symlinlk\tMake soft inks of duplicate files, removing all the duplicates\n \t--hardlink \tMake hard links of duplicate files USE WITH CAUTION! \t \tmodifying one of the files after linking will modify all, \t \tsince they are all hardlinks. If the duplicates are in different \t \tfilesystems they can't be hardlinked)\n \t--converge=[DIR]\t(very useful) cleans duplicates from all directories except DIR, so the others have unique files. Duplicates inside DIR itself aren't cleared to keep DIR integrity.\n \t--nodup \t(DANGEROUS) Delete duplicate files (USE WITH EXTREME CAUTION, HAVE BACKUPS)\n\n Notes:\tDoes not follow symlinks while traversing directories. Symlinks are created to absolute paths."; } sub count { $totalfiles++ if ( -f $_ ); print "\r"; print $progress[($totalfiles+1)%4]; } sub found { my ($fname,$digest,$FILE); my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,$atime,$mtime,$ctime,$blksize,$blocks); my $md5 = Digest::MD5->new; $fname = $_; if ( -f $fname && ! -l $fname ) { open(FILE,"<",$fname) or die "Can't open '$fname': $!"; binmode(FILE); $md5->reset; $digest = $md5->addfile(*FILE)->hexdigest; ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,$atime,$mtime,$ctime,$blksize,$blocks) = stat FILE; close(FILE); push @files,{ name => $File::Find::name, dir=> $File::Find::dir, size => $size, md5 => $digest }; $sumbytes += $size; print "$size $File::Find::name $digest\n" if ( $verbose ); $numfiles++; printf "\r%25s\r",""; printf "Progress [%d/%d] %d%%",$numfiles,$totalfiles,($numfiles*100)/$totalfiles; } } sub read_and_sum { print "Reading and md5summing files, this could take a long time\n" if ( ! $quiet ); # if( -f "files.store" ) { # my $ref = retrieve('files.store') or die "retrieve: $!"; # @files = @$ref; # } else { $time_i = time; if ( @ARGV == 0 ) { usage(); exit 1; } else { foreach my $arg (@ARGV) { if ( ! -d $arg ) { die "$0: $arg must be a directory"; } } # This will populate @files array find(\&count,@ARGV); find(\&found,@ARGV); } $time_f = time; $time = $time_f - $time_i; print "\n" if ( ! $quiet ); store \@files, 'files.store'; # } } sub find_duplicates { @files = sort { $b->{size} <=> $a->{size} || $b->{md5} cmp $a->{md5} } @files; my ($j, $dupes, $size, $md5); $dupes=0; $j=0; $size = $files[0]->{size}; $md5 = $files[0]->{md5}; print "Searching for duplicates... " unless ( $quiet ); for( my $i=1 ; $i < @files; $i++ ) { if ( ($md5 eq $files[$i]->{md5}) && ($size eq $files[$i]->{size}) ) { #This is a duplicate of the previous file. # j is the index of the array of files that are the same: @eq if ( ! $dupes ) { #The previous file is the same as the current. #An array is created to hold all the files that are the same push @eq, []; #The previous array is populated with hashes, each one represents a file push @{$eq[$j]}, $files[$i-1]; $multiple++; } push @{$eq[$j]}, $files[$i]; $dupbytes += $files[$i]->{size}; $dupfiles++; $dupes = 1; } else { $size = $files[$i]->{size}; $md5 = $files[$i]->{md5}; $j++ if ( $dupes == 1 ); $dupes = 0; } } if ( ! $quiet ) { foreach my $l (@eq) { print "Duplicates:\n"; foreach my $m (@$l) { print " * symbolic link:" if ( -l $m->{name} ); print " $m->{name} $m->{size} $m->{md5} \n"; } print "\n"; } } printf "done. %d multiple files. %d redundant files. %d Kb are redundant\n",$dupfiles+$multiple,$dupfiles,($dupbytes/1024) unless ( $quiet ); # printf "Total: %d files. Average md5summing speed: %.3f MB/s\n\n",$numfiles,($sumbytes/($time*1024*1024)) unless ( $time == 0 || $quiet ); } sub symlink_dupes { my $cwd = getcwd(); $cwd .="/"; print "Symlinking files... " unless ($quiet); foreach my $l (@eq) { for( my $m = 1; $m < @$l; $m++) { my ($src,$dst); $src = $$l[0]->{name}; $src =~ s#^\./##; $src = quotemeta($cwd.$src); $dst = $$l[$m]->{name}; $dst =~ s#^\./##; $dst = quotemeta($cwd.$dst); unlink $$l[$m]->{name} or die "Cannot unlink $$l[$m]->{name}"; print "symlinking $$l[$m]->{name} to $$l[0]->{name}\n" if ($verbose); qx/ln -s -- $src $dst/; $? == 0 or die "Something wicked happened while trying to symlink $$l[0]->{name} and $$l[$m]->{name}" } } print "done\n" unless ($quiet); } sub hardlink_dupes { print "Hardlinking files... " unless ($quiet); foreach my $l (@eq) { for( my $m = 1; $m < @$l; $m++) { my ($src,$dst); $src = $$l[0]->{name}; #$src = quotemeta($src); $dst = $$l[$m]->{name}; #$dst = quotemeta($dst); unlink $$l[$m]->{name} or die "Cannot unlink $$l[$m]->{name}"; print "hardlinking $$l[$m]->{name} to $$l[0]->{name}\n" if ($verbose); link($src,$dst) or die "link $src $dst: $!\n"; } } print "done\n" unless ($quiet); } sub converge_dupes { my $m = []; print "$converge\n"; EQ: foreach my $l (@eq) { DUP: foreach my $dup (@$l) { if ( $dup->{name} =~ m#(\./)?\Q$converge\E# ) { @$m = grep { $_ if $_->{name} !~ m#(\./)?\Q$converge\E# } @$l; push @del,$m; undef $m; next EQ; } } } foreach my $l (@del) { foreach my $dup (@$l) { unlink $dup->{name} or die "unlink $$dup->{name}: $!\n"; print "unlink ".$dup->{name}."\n"; } } } sub nodup { print "Removing duplicates... " unless ($quiet); foreach my $l (@eq) { for( my $m = 1; $m < @$l; $m++) { # # We keep $$l[0] copy. # unlink $$l[$m]->{name} or die "unlink $$l[$m]->{name}: $!"; print "unlink\t$$l[$m]->{name}\n" unless $quiet; } } print "done\n" unless ($quiet); } ########################################################### # main program ########################################################### # # Make stdout unbuffered # my $old_fh = select(STDOUT); $| = 1; select($old_fh); if ( $help ) { usage(); } read_and_sum(); find_duplicates(); symlink_dupes() if ( $symlink ); hardlink_dupes() if ( $hardlink ); nodup() if ( $nodup ); converge_dupes() if ( $converge );