#!/usr/bin/perl
#
# Copyright © 2015-2020 by Vincent Slyngstad
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS LISTED ABOVE BE LIABLE
# FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
# CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#
# Except as contained in this notice, the names of the authors above
# shall not be used in advertising or otherwise to promote the sale, use
# or other dealings in this Software without prior written authorization
# from those authors.
@rem = '
@echo off
c:\perl5\bin\perl %0 %1 %2 %3 %4 %5 %6 %7 %8 %9
goto endofperl
@rem ' if @rem;
#
# The five lines above let this one file be read both as a Perl script
# and as a DOS/Windows batch file.
#
# To Perl they are a single statement: a multi-line single-quoted string
# is assigned to @rem, but only "if @rem".  @rem is empty at that point,
# so the statement does nothing.
#
# To the command interpreter the two "@rem" lines are remarks.  The lines
# between them turn command echo off, run c:\perl5\bin\perl on this same
# file (%0) passing along at most nine arguments (%1 through %9), and
# then jump to the ":endofperl" label that follows the __END__ marker, so
# none of the Perl code is interpreted as batch commands.
#
# NOTE(review): the interpreter path is hard-coded; presumably it has to
# be edited to match the local Perl installation -- confirm.
#
# Command line: an optional leading "-r" (also scan the subdirectories
# that are found), followed by the directories to scan.  When no
# directory is named, the current directory is used.
$recursive = ($ARGV[0] eq "-r") ? 1 : 0;
shift @ARGV if $recursive;
@todo = @ARGV ? @ARGV : (".");
# Experiment 1: Read all of both files, but keep checksums.  Two files
# need comparison only if one has never been read, or their checksums
# are equal.  This should be a win if large numbers of files are the
# same size.
#   Anecdotally, save 31% of compares on one large run.
#   Averaged 2.15 compares per file on that same run.
# $compares counts the comparisons attempted, $saved the ones settled
# from checksums alone, and %sum maps a path to its checksum.
($compares, $saved) = (0, 0);
%sum = ();
#
# Experiment 2, part 1 of 2: Keep checksums between runs. Two files need
# be read only if one has never been read.
#
# Load the checksums saved in "dups.db" in the current directory, if that
# file can be opened; a missing database is not an error.  Each line is
# "<path> <checksum>".  Backslashes in the path are turned into forward
# slashes before the entry is stored in %sum.
if (open(my $db, "<", "dups.db")) {
  print STDERR "Reading sum database...";
  while (defined(my $line = <$db>)) {
    # The first group is greedy, so the split happens at the LAST space
    # and paths that contain spaces stay intact.  Lines without any
    # space are not entries at all; skip them instead of storing an
    # empty key.
    next unless $line =~ /(.*) (.*)/;
    my ($path, $cksum) = ($1, $2);
    # TODO: normalize $path here.
    $path =~ tr{\\}{/};
    $sum{$path} = $cksum;
  }
  close($db);
  print STDERR "done\n";
}
#
# Compare two files known to be the same size.
#
# Arguments: the two path names, $f1 and $f2.
#
# Returns 0 when the two files have identical contents, 1 otherwise.
# "Otherwise" covers three cases:
#   - $f2 ends in a path separator followed by the basename of $f1
#     (copies that share a name are deliberately not treated as
#     duplicates; nothing is read or counted);
#   - both checksums are already in %sum and are not equal (nothing is
#     read; $saved is incremented);
#   - the contents differ.
#
# Side effects: increments $compares for every call that gets past the
# basename test, stores a checksum for each file in %sum, and dies with
# "<path>: <error>" if either file cannot be opened.
#
# Must be called as &cmp(...), because "cmp" is also a Perl operator.
sub cmp {
  my ($f1, $f2) = @_;

  # Experiment: Two identical files is OK if they
  # have the same basename.
  # \Q..\E quotes the whole name, so characters such as ( ) + . in a
  # file name are matched literally instead of being regex syntax.
  (my $basename = $f1) =~ s{.*[\\/]}{};
  return 1 if $f2 =~ m{[\\/]\Q$basename\E\z};

  $compares += 1;
  my ($known1, $known2) = ($sum{$f1}, $sum{$f2});
  if (defined($known1) && defined($known2) && $known1 != $known2) {
    # Different checksums mean different contents: no reading needed.
    $saved += 1;
    return 1;
  }

  # Three-argument open: the path is never interpreted as a mode, and
  # leading or trailing blanks in it are preserved.
  open(my $fh1, "<", $f1) or die "$f1: $!";
  binmode($fh1);
  open(my $fh2, "<", $f2) or die "$f2: $!";
  binmode($fh2);

  my ($result, $sum1, $sum2) = (0, 0, 0);
  # Read both files record by record, in step.  Lexical record variables
  # keep the caller's $_ intact, and the defined() tests keep a final
  # record of "0" from being mistaken for end of file.
  while (defined(my $rec1 = <$fh1>)) {
    $sum1 += unpack("%32C*", $rec1);
    my $rec2 = <$fh2>;
    if (defined($rec2)) {
      $sum2 += unpack("%32C*", $rec2);
      next if $rec2 eq $rec1;
    }
    # Either the records differ or $f2 ran out first.
    $result = 1;
    if (defined($known1)) {
      # No need to read the rest of $f1.
      $sum1 = $known1;
      last;
    }
  }
  # Finish off $f2 if it has more data.
  while (defined(my $rec2 = <$fh2>)) {
    $result = 1;
    $sum2 += unpack("%32C*", $rec2);
  }
  close($fh1);
  close($fh2);

  $sum{$f1} = $sum1;
  $sum{$f2} = $sum2;
  return $result;
}
# From here on everything printed to STDOUT is written to vv.bat.
open(STDOUT, ">vv.bat") || die "vv.bat: $!";
# Scan every directory on the work list and group the files found by
# size in %files: the key is the size, the value is the list of paths
# joined with apostrophes.  Subdirectories are appended to the work list
# when $recursive is set.
while (@todo) {
  my $dir = shift @todo;
  opendir(my $dh, $dir) || die "$dir: $!";
  print STDERR "Reading '$dir' ...";
  foreach my $name (readdir($dh)) {
    my $f = "$dir/$name";
    # TODO: normalize $f here!
    $f =~ tr{\\}{/};
    # Entries that are never examined, by exact name ...
    next if $name eq "." || $name eq "..";
    next if $name eq ".git" || $name eq ".svn";
    next if $name eq "backup" || $name eq "Camera";
    next if $name eq "Casio" || $name eq "Tax";
    # NOTE(review): readdir returns bare entry names, so the next three
    # patterns (which require a "/") can never match.  They would have
    # to test $f to have any effect -- confirm the intent before
    # changing them.
    next if $name =~ /\/Android/;
    next if $name =~ /\/.android/;
    next if $name =~ /\/TaxCut/;
    # ... by extension ...
    next if $name =~ /\.(?:csv|htm|html|ini|lnk|rdp)$/;
    # ... or by pattern.
    next if $name =~ /TaxCut[0-9]/;
    #BUGBUG: Experimental (April 2021)
    # This escapes blanks in the entry name only; $f was built before
    # this point and is not affected.
    $name =~ s/([ ])/\\ /g;
    # The apostrophe is the separator inside %files values, so a name
    # that contains one cannot be stored.
    if ($name =~ /(['])/) {
      print "Skipping $f (imbedded '$1') ...\n";
      print STDERR "\nSkipping $f (imbedded '$1') ...";
      next;
    }
    if (-d $f) {
      push(@todo, $f) if $recursive;
      next;
    }
    my $size = -s $f;
    # -s yields undef when the entry cannot be stat'd (for instance a
    # dangling link, or a file removed since readdir).  Such an entry
    # would otherwise be grouped with the empty files and make the
    # comparison die when it tries to open it.
    unless (defined $size) {
      print STDERR "\nSkipping $f ($!) ...";
      next;
    }
    $files{$size} = defined($files{$size}) ? "$files{$size}'$f" : $f;
  }
  print STDERR "\n";
  closedir($dh);
}
# Compare the files within each size group and print every pair found to
# be identical as "<path1> <path2>" on STDOUT, with backslashes turned
# into forward slashes.  Progress (percentage of size groups handled)
# and a closing summary go to STDERR.
print STDERR "Writing vv.bat...\n";
$opercent = -1;
# String-order comparison for sort.
sub byname { $a cmp $b }
my $ngroups = keys %files;
my $groups_done = 0;
my $nfiles = 0;
foreach my $size (keys %files) {
  # Get a list of all files of this size, sorted, each path once (a
  # path can be listed twice when a directory is scanned twice).
  my %listed;
  my @group = sort byname grep { !$listed{$_}++ } split(/'/, $files{$size});
  $nfiles += @group;
  $groups_done++;
  my $npercent = int((100 * $groups_done) / $ngroups);
  if ($npercent != $opercent) {
    # Printing every time slows down the checking.
    printf STDERR "%3d%%\r", $npercent;
    $opercent = $npercent;
  }
  # Visit every unordered pair exactly once, lower name first.  This is
  # the same sequence of comparisons as a full double loop that skips
  # pairs already seen, without having to remember the pairs.
  my %reported;
  for my $x (0 .. $#group - 1) {
    for my $y ($x + 1 .. $#group) {
      my ($i, $j) = @group[$x, $y];
      # &cmp returns true when the pair is not to be reported.
      next if &cmp($i, $j);
      # Two identical files; say nothing if both were already listed.
      next if $reported{$i} && $reported{$j};
      $reported{$i} = $reported{$j} = 1;
      (my $out = "$i $j\n") =~ tr{\\}{/};
      print $out;
    }
  }
}
print STDERR "\n";
# Report the number of files examined; a hash in numeric context does
# not give that.
printf STDERR "Saved %d of %d compares on %d files\n",
  $saved, $compares, $nfiles;
#
# Experiment 2, part 2 of 2: Keep checksums between runs. Two files need
# be read only if one has never been read.
#
# Write %sum to "dups.db" in the current directory, one
# "<path> <checksum>" line per entry, in string order of the paths.
# Failing to write the database is reported on STDERR but is not fatal.
if (open(my $db, ">", "dups.db")) {
  print STDERR "Writing sum database...";
  foreach my $path (sort keys %sum) {
    # Save just the stuff in subdirectories (at least two separators).
    next unless $path =~ m{[/\\].*[/\\]};
    # Leave out anything under a windows/desktop directory.  This must
    # be an ordinary match: with "?" as the delimiter, m?...? succeeds
    # only once per run, so only the first such path would be left out.
    next if $path =~ m{[/\\]windows[/\\]desktop[/\\]};
    print $db "$path $sum{$path}\n";
  }
  # Buffered write errors only show up when the handle is closed.
  if (close($db)) {
    print STDERR "done\n";
  }
  else {
    print STDERR "failed: $!\n";
  }
}
else {
  print STDERR "dups.db: $!\n";
}
__END__
:endofperl