#!/usr/bin/perl
#
# Copyright © 2015-2020 by Vincent Slyngstad
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS LISTED ABOVE BE LIABLE
# FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
# CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#
# Except as contained in this notice, the names of the authors above
# shall not be used in advertising or otherwise to promote the sale, use
# or other dealings in this Software without prior written authorization
# from those authors.

# Batch-file wrapper: when run by the DOS command interpreter the lines
# below re-invoke perl on this file; when run by perl they are merely a
# string assigned to @rem.  This must stay ahead of "use strict".
@rem = '
@echo off
c:\perl5\bin\perl %0 %1 %2 %3 %4 %5 %6 %7 %8 %9
goto endofperl
@rem ' if @rem;

use strict;
use warnings;

#
# Usage: [-r] [dir ...]
#
# Scans the given directories (default ".", recursing with -r), groups the
# files by size, compares the files within each group, and writes each pair
# found to be identical to vv.bat as "path1 path2".  Checksums are cached
# in dups.db between runs.
#
my $recursive = 0;
if (@ARGV && $ARGV[0] eq "-r") {
    shift @ARGV;
    $recursive = 1;
}
my @todo = @ARGV;
@todo = (".") unless @todo;

# Experiment 1: Read all of both files, but keep checksums.  Two files need
# comparison only if one has never been read, or their checksums are equal.
# This should be a win if large numbers of files are the same size.
# Anecdotally, save 31% of compares on one large run.
# Averaged 2.15 compares per file on that same run.
my $compares = 0;    # number of comparisons requested
my $saved    = 0;    # comparisons settled from cached checksums alone
my %sum      = ();   # path => checksum

#
# Experiment 2, part 1 of 2: Keep checksums between runs.  Two files need
# be read only if one has never been read.
# A missing database is not an error (e.g. on the first run).
if (open(my $db_in, '<', "dups.db")) {
    print STDERR "Reading sum database...";
    while (my $line = <$db_in>) {
        # Each record is "<path> <checksum>"; the greedy first group means
        # the checksum is whatever follows the last space.
        next unless $line =~ /(.*) (.*)/;
        my ($path, $checksum) = ($1, $2);
        # TODO: normalize $path here.
        $path =~ tr{\\}{/};
        $sum{$path} = $checksum;
    }
    close($db_in);
    print STDERR "done\n";
}

#
# Compare two files known to be the same size.
#
# Arguments: the two paths.
# Returns:   0 if the files have identical contents; 1 if they differ, or
#            if they share a basename (such pairs are deliberately never
#            reported).
# Side effects: updates $compares and $saved, and records the checksum of
#            each file that was read in %sum.  Dies if either file cannot
#            be opened.
#
# NOTE: because the sub is named like the "cmp" operator it has to be
# called with an explicit ampersand: &cmp($f1, $f2).
sub cmp {
    my ($f1, $f2) = @_;

    # Experiment: Two identical files is OK if they
    # have the same basename.
    my $basename = $f1;
    $basename =~ s/.*[\\\/]//;
    # \Q...\E keeps regex metacharacters in the file name literal.
    return 1 if $f2 =~ /[\\\/]\Q$basename\E\z/;

    $compares += 1;
    if (defined($sum{$f1}) && defined($sum{$f2})) {
        # Both checksums known: differing checksums prove the files differ
        # without reading either of them.
        if ($sum{$f1} != $sum{$f2}) {
            $saved += 1;
            return 1;
        }
    }

    my $result = 0;
    my $sum1   = 0;
    my $sum2   = 0;
    open(my $fh1, '<', $f1) or die "$f1: $!";
    binmode($fh1);
    open(my $fh2, '<', $f2) or die "$f2: $!";
    binmode($fh2);

    # Read both files chunk by chunk (newline-delimited), accumulating a
    # checksum of the byte values for each ("%32C*" is unpack's 32-bit
    # checksum of unsigned chars).
    while (defined(my $chunk1 = <$fh1>)) {
        $sum1 += unpack("%32C*", $chunk1);
        # Test for definedness, not truth: a final chunk of "0" is data.
        my $chunk2 = <$fh2>;
        if (defined $chunk2) {
            $sum2 += unpack("%32C*", $chunk2);
            next if $chunk2 eq $chunk1;
        }
        $result = 1;
        if (defined($sum{$f1})) {
            # No need to read the rest of $f1.
            $sum1 = $sum{$f1};
            last;
        }
    }
    # Finish off $f2 if it has more data.
    while (defined(my $rest = <$fh2>)) {
        $result = 1;
        $sum2 += unpack("%32C*", $rest);
    }
    close($fh1);
    close($fh2);

    $sum{$f1} = $sum1;
    $sum{$f2} = $sum2;
    return $result;
}

# Everything printed to STDOUT from here on lands in vv.bat.
open(STDOUT, '>', "vv.bat") or die "vv.bat: $!";

# Directory entries that are never examined, by exact name ...
my %skip_name = map { $_ => 1 }
    (".", "..", ".git", ".svn", "backup", "Camera", "Casio", "Tax");
# ... and by pattern.  All patterns are tested against the bare entry name.
# NOTE(review): names returned by readdir contain no "/", so the three
# patterns beginning with "\/" can never match; presumably they were meant
# to be tested against the full path.  Left as-is pending confirmation.
my @skip_pattern = (
    qr/\/Android/,
    qr/\/.android/,
    qr/\/TaxCut/,
    qr/\.csv$/,
    qr/\.htm$/,
    qr/\.html$/,
    qr/\.ini$/,
    qr/\.lnk$/,
    qr/\.rdp$/,
    qr/TaxCut[0-9]/,
);

my %files  = ();    # size => [ paths of the files having that size ]
my $nfiles = 0;     # total number of files recorded in %files

while (@todo) {
    my $dir = shift @todo;
    opendir(my $dh, $dir) or die "$dir: $!";
    print STDERR "Reading '$dir' ...";
    foreach my $entry (readdir($dh)) {
        my $f = "$dir/$entry";
        # TODO: normalize $f here!
        $f =~ tr{\\}{/};
        next if $skip_name{$entry};
        next if grep { $entry =~ $_ } @skip_pattern;

        # BUGBUG: Experimental (April 2021): escaping of embedded spaces.
        # NOTE(review): the experiment rewrote the entry name after $f had
        # already been built, so it never changed any path or output; the
        # statement has been dropped.

        if ($entry =~ /(['])/) {
            print "Skipping $f (imbedded '$1') ...\n";
            print STDERR "\nSkipping $f (imbedded '$1') ...";
            next;
        }
        if (-d $f) {
            push(@todo, $f) if $recursive;
            next;
        }
        # -s yields a defined (false) value for an empty file and undef
        # when the file cannot be stat'ed (e.g. a dangling symlink).
        my $size = -s $f;
        if (!defined $size) {
            print STDERR "\nSkipping $f ($!) ...";
            next;
        }
        push(@{ $files{$size} }, $f);
        $nfiles++;
    }
    print STDERR "\n";
    closedir($dh);
}

print STDERR "Writing vv.bat...\n";
my $sizes_done = 0;
my $opercent   = -1;
my $nsizes     = keys %files;
sub byname { $a cmp $b }
foreach my $size (keys %files) {
    # Get a list of all files of this size
    my @group     = sort byname @{ $files{$size} };
    my %pair_done = ();    # pairs already compared; reset per size to
                           # reduce memory usage
    my %reported  = ();    # files already written out as part of a pair
    $sizes_done++;
    my $npercent = int((100 * $sizes_done) / $nsizes);
    if ($npercent != $opercent) {
        # Printing every time slows down the checking.
        printf STDERR "%3d%%\r", $npercent;
        $opercent = $npercent;
    }
    for my $i (@group) {
        for my $j (@group) {
            next if $i eq $j;
            # Two different files
            if ($i lt $j) {
                next if $pair_done{$i,$j};
                $pair_done{$i,$j} = 1;
            }
            else {
                next if $pair_done{$j,$i};
                $pair_done{$j,$i} = 1;
            }
            # Two files never compared before
            next if &cmp($i, $j);
            # Two identical files; skip the pair if both members have
            # already appeared in the output.
            next if $reported{$i} && $reported{$j};
            $reported{$i} = 1;
            $reported{$j} = 1;
            my $out = "$i $j\n";
            $out =~ tr{\\}{/};
            print $out;
        }
    }
}
# Buffered write errors only surface when the handle is closed.
close(STDOUT) or die "vv.bat: $!";
print STDERR "\n";
printf STDERR "Saved %d of %d compares on %d files\n",
    $saved, $compares, $nfiles;

#
# Experiment 2, part 2 of 2: Keep checksums between runs.  Two files need
# be read only if one has never been read.
if (open(my $db_out, '>', "dups.db")) {
    print STDERR "Writing sum database...";
    foreach my $f (sort byname keys %sum) {
        # Save just the stuff in subdirectories
        next unless $f =~ m{[/\\].*[/\\]};
        # m{} rather than m??: the latter matches only once per run.
        next if $f =~ m{[/\\]windows[/\\]desktop[/\\]};
        print $db_out "$f $sum{$f}\n";
    }
    close($db_out) or print STDERR "dups.db: $!\n";
    print STDERR "done\n";
}
else {
    # Losing the cache is not fatal, but say so.
    print STDERR "dups.db: $!\n";
}
__END__
:endofperl