#! /usr/bin/perl -w

# NAME: cleanemail.perl - Remove deleted or duplicate messages from POP
# mailboxes.
#
# Author: David Booth 4-Oct-2002.  Improved 9-Jan-2006, 11-Mar-2009
#         and 24-Nov-2009.  For contact info see: http://dbooth.org/
#
# License: GNU GPL: http://www.opensource.org/licenses/gpl-license.php
#
# INTRODUCTION:
# This program removes duplicate or previously deleted messages 
# from POP mailbox files, permanently deleting them.
# It is especially useful for cases in which your email client loses
# track of which messages it has downloaded from your mail server,
# and downloads all messages again, thus creating hundreds or thousands
# of duplicate messages in your Inbox.
#
# This program works for Evolution and previously worked for Eudora
# (and hopefully still does, though I haven't tested it for quite a while).  
# It should also work for other mail clients that use the mbox file 
# format to store messages, though it may not know to delete associated 
# index files.  Please let me know your experience if you try it with 
# other email clients.
#
# QUICK START FOR EVOLUTION USERS:
# 1. In Evolution, create a top level mail folder called "Trash-emptied" 
# (if you don't already have one) and copy the entire contents of 
# your Trash folder to the Trash-emptied folder.  This will allow 
# previously deleted messages to be detected and re-deleted from your 
# Inbox and other folders.  It doesn't matter if there are duplicate 
# messages in this folder.
# 2. Exit from Evolution!
# 3. cd ~/.evolution/mail/local
# 4. Run: perl cleanemail.perl *
# Or, if you use sub-folders, run: perl cleanemail.perl * *.sbd/*
#
# QUICK START FOR EUDORA USERS:
# 1. In Eudora, you should 
#    "Special->Compact Mailboxes".  This normalizes the *.mbx files.
#    Otherwise, the *.toc files seem to indicate which messages have
#    been logically but not physically moved from the *.mbx files.
# 2. Exit from Eudora!
# 3. cd your-email-directory
# where your-email-directory is your email directory.
# 4. Run: perl cleanemail.perl *.mbx
#
# OVERVIEW:
# This program deletes duplicate or previously moved or trashed
# messages from mailboxes that are stored in mbox format:
# http://en.wikipedia.org/wiki/Mbox
# http://www.qmail.org/man/man5/mbox.html
# Messages deleted by this program are REALLY deleted, i.e.,
# they are NOT just moved to a Trash folder.
#
# Inbox(s) are processed slightly differently from non-inbox files.
# Since users often move messages from the Inbox to other folders
# (including the trash), any messages that appear in 
# any non-inbox file will be deleted from the Inbox file(s).  
# Furthermore, duplicate messages in any file will also be deleted.
# 
# WARNING: There is a danger to this algorithm.  Suppose you receive
# multiple copies of a message in your Inbox, you manually moving 
# one of the duplicates to the trash, and then you run this
# program.  In this case, this program will detect the message in
# your trash and delete all copies of it from your Inbox, under
# the assumption that you intended to trash all copies, leaving
# only the single copy of the message in your trash.
# To avoid this problem, you should not manually delete ANY messages 
# when you receive duplicates.  Instead, run this program to delete
# the duplicates.
#
# Files are modified in place.  However, for safety
# all work is initially performed on temporary files.  After a file
# has been processed, it is renamed to *.cleanemail.bak
# (as a backup) and then the temp file is renamed to take
# the place of the original file.  If you want the .bak files
# to be automatically deleted afterward, use the -db or
# -deleteBackups option.
#
# Certain files get special treatment:
#	Out.mbx		-- Ignored.  Not processed at all. 
#	Sent.mbx	-- Ignored.  Not processed at all. 
#	Drafts.mbx	-- Ignored.  Not processed at all. 
# Default trash files (for special treatment as trash):
#	Trash.mbx	-- Processed as a trash file.
#	Trash-*.mbx	-- Processed as a trash file.
#	SPAMn.mbx	-- (Where n is a number) Processed as a trash file.
#	_SPAMn.mbx	-- (Where n is a number) Processed as a trash file.
#	_SuspectedSPAM.mbx	-- Processed as a trash file.

# Extensions of evolution and other files that are also ignored:
my @extensionsToIgnore = qw(
	ev-summary-meta
	cmeta
	ev-summary
	index
	data
	temp
	bak
	db
	sbd
	);
my $extensionsToIgnorePattern = join("|", map {quotemeta($_)} @extensionsToIgnore);

############### Maintenance Note ##################
# Earlier versions of this program distinguised "trash" files from
# regular mailbox files, with the idea that if a message appeared in
# a trash file, it should be deleted from the regular mailbox files
# under the assumption that the user had previously deleted it.
# However, I later realized that instead the program should distinguish
# the *Inbox* from other mailbox files, and if a message appears in
# any of the other mailbox files (including trash files) it should be
# deleted from the Inbox under the assumption that the user had previously
# moved it *from* the Inbox to that folder (either because of deleting
# it or just for organizational purposes).  Therefore, although the code
# below refers to "trashFiles", these are now any regular mail files
# *except* for the Inbox.
###################################################

#
# A note about carriage returns (ctl-M, or "CR")
# ==============================================
# MS Windows machines seem to automatically strip CR off the end
# of a line on reading, and add a CR when writing.
# This caused me a lot of grief in debugging this program,
# because the number of bytes read/written were not matching the file sizes!
# To make things work right under W2K, by default I delete CRs
# at the end of a line on input.  (This is controlled by the global
# $rmCR flag -- see &ReadLine.)  
# Since I delete any number of MULTIPLE CRs at the 
# end of a line (not just one), this means that the resulting messages may be 
# slightly DIFFERENT from the original messages.
# In the 5+ years of using this script I have not seen this cause a problem, 
# but it seems plausible to
# me that it could for some data.  I'm afraid I don't know the
# rules for email message syntax well enough to know whether it could.

########################## Main #############################
my $thisProgram = "cleanemail";

# What pattern indicates the start of a new message?
#   Here are some normal ones:
# From csail-related-bounces@lists.csail.mit.edu Wed Sep  8 18:34:17 2004
# From ???@??? Mon Sep 23 11:23:37 2002
#   Here are some weird ones (probably from spammers):
# From [?var=TAGMAILFROM] Wed Jan 21 12:43:03 2009
# From \Annette Gorman" <CathyhalmaHickey@wikipedia.org> Mon Sep 15 21:49:32 2008
# From _almansour@acee.net Thu Mar 20 10:00:03 2008
# From a%CUST_WORD@disnet1.org Wed Dec 24 23:42:24 2008
# From a-manterola@ej-gv.es Mon Sep  8 16:06:49 2008
# From de'an@4ur.com Sat Jan  5 15:29:20 2008
# From et{ist{_1995@dudapropaganda.com.br Tue Oct 28 21:04:57 2008
# From manager#05@wachovia.com Thu Dec 11 16:51:34 2008
# my $messageStartPattern = "\\AFrom \\s*[^\\@]*\\@[^\\@]*.*\\d+\\:\\d+\\:\\d+\\s+\\d+\\s*\\Z";
# Don't require an @ because some spam doesn't use it:
my $messageStartPattern = "\\AFrom .*\\d+\\:\\d+\\:\\d+\\s+\\d+\\s*\\Z";
my $maxMessageStartLength = 400;	# Anything longer than this may indicate an error.
# warn "maxMessageStartLength: $maxMessageStartLength messageStartPattern: $messageStartPattern\n";

my $progressCharsPerLine = 50;	# Set to 0 if progress indicator is not wanted.
my $savedChar = ".";		# Indicates a message retained
my $deletedChar = "X";		# Indicates a trashed message deleted
my $duplicateChar = "x";	# Indicates a duplicate message deleted
my $nMessagesPerSummary = 100;	# Set to 0 for no progress summary.
$progressCharsPerLine = 0 if $nMessagesPerSummary;	# Override?

# In comparing messages to determine duplicates, these headers will
# be compared and all other headers will be ignored.  (A checksum of
# the message body will always be compared also.  See the &Hash() 
# function below.)  # Do not confuse the "From" header with the "From:'
# header.  The former has a time-stamp in it that we do NOT
# want to compare.
my @headersWanted = qw( Return-Path: Date: From: Subject: Sender: To: );
my $headersWantedPattern = join("|", map {quotemeta($_)} @headersWanted);
# warn "headersWantedPattern: $headersWantedPattern\n";

# Get options and look for trash files
my $force = 0;		# -f	-- Force. (Clobber existing .temp files)
my $debug = 0;		# -debug	-- Keep .temp files.
my $noop = 0;		# -noop	-- No operation.  Show what would be done,
			# 	   but do not save the results.
my $verbose = 0;	# -v	-- Show lots of info.
my $rmCR = 1;		# -cr	-- Remove carriage returns at ends of lines.
my $rmTOC = 1;		# -toc	-- Remove the .toc files (but not Out.toc)
my @tocExtensions = qw( toc ev-summary ); # Extensions of files to remove
			# 	   if $rmTOC is set.
my $deleteTrashed = 1;	# -deleteTrashed
			#	-- Delete previously trashed or moved messages.
			#	   This causes messages found in trash folders
			#	   to be deleted from non-trash folders.
my $deleteBackups = 0;	# -deleteBackups

my $debugHash = 0;
my $trashPattern = "(Trash(.*)|(_?)((Suspected)?)SPAM([\-\_]\\d*)";
my $outPattern = "(Out|Outbox|Sent|Drafts|Templates)";
my $inPattern = "(In|Inbox)";
my @trashFiles = ();
my @inFiles = ();
my @args = ();
my @ignoredFiles = ();
while(@ARGV)
	{
	my $arg = shift @ARGV;
	if (0) {}
	elsif ($arg eq "-maxMessageStartLength") { $maxMessageStartLength = shift @ARGV; }
	elsif ($arg eq "-v") { $verbose = 1; }
	elsif ($arg eq "-n") { $noop = 1; }
	elsif ($arg eq "-noop") { $noop = 1; }
	elsif ($arg eq "-cr") { $rmCR = 1; }
	elsif ($arg eq "-f") { $force = 1; }
	elsif ($arg eq "-toc") { $rmTOC = 1; }
	elsif ($arg eq "-debug") { $debug = 1; }
	elsif ($arg eq "-deleteBackups") { $deleteBackups = 1; }
	elsif ($arg eq "-db") { $deleteBackups = 1; }
	elsif ($arg eq "-deleteTrashed") { $deleteTrashed = 1; }
	elsif ($arg eq "-dt") { $deleteTrashed = 1; }
	elsif ($arg eq "-i") { push(@inFiles, shift @ARGV); }
	elsif ($arg eq "-t") { push(@trashFiles, shift @ARGV); }
	elsif (-d $arg) 
		{ push(@ignoredFiles, $arg); }
	elsif ($arg =~ m/\.($extensionsToIgnorePattern)\Z/i) 
		{ push(@ignoredFiles, $arg); }
	elsif ($arg =~ m/\A(.*[\/\\])?$outPattern\Z/i)
		{ push(@ignoredFiles, $arg); }
	##### No longer check for trashPattern, because all files except
	##### Inbox will now be treated as trash files:
	# elsif ($arg =~ m/\A(.*[\/\\])?$trashPattern\Z/i) { push(@trashFiles, $arg); }
	elsif ($arg =~ m/\A(.*[\/\\])?$inPattern\Z/i) { push(@inFiles, $arg); }
	elsif ($arg =~ m/\A\-/) { warn "Unknown option: $arg\n"; &DieUsage(); }
	else { push(@args, $arg); }
	}
my $nIgnored = scalar(@ignoredFiles);	# Number of files ignored
@ARGV = @args;
die "ERROR: Bad maxMessageStartLength\n" if $maxMessageStartLength <= 0;
@ARGV || @inFiles || @trashFiles || $nIgnored || &DieUsage();

##### These lines convert from the old approach of treating everything
##### as Inboxes except trash files, to the new approach of treating
##### everything *except* the Inboxes as trash files:
my %inFiles = map {($_,$_)} @inFiles;
my %trashFiles = map {($_,$_)} @trashFiles;
my %otherFiles = map {($_,$_)} @ARGV;
# Force trashFiles to not be considered inFiles:
foreach my $f (keys %trashFiles) 
	{ 
	delete($inFiles{$f}) if exists($inFiles{$f}); 
	delete($otherFiles{$f}) if exists($otherFiles{$f}); 
	}
# Force inFiles to not be considered trashFiles:
foreach my $f (keys %inFiles) 
	{ 
	delete($trashFiles{$f}) if exists($trashFiles{$f}); 
	delete($otherFiles{$f}) if exists($otherFiles{$f}); 
	}
# Treat other files as trash files by default:
%trashFiles = map{($_,$_)} (keys %otherFiles, keys %trashFiles);
@trashFiles = sort keys %trashFiles;
@ARGV = sort keys %inFiles;

if (@trashFiles && !$deleteTrashed)
	{
	warn "Non-inbox files found: \n\t",join("\n\t", @trashFiles), "\n";
	warn "\nNOTE: Non-inbox files will be treated as Inbox files\n";
	warn "because the -dt option was NOT specified.  If desired, you\n";
	warn "can use the -dt option to cause all messages found in any\n";
	warn "trash file to be deleted from the regular mailbox files.\n\n";
	@ARGV = (@trashFiles, @ARGV);
	@trashFiles = ();
	}

warn "Files ignored: \n\t", join("\n\t", @ignoredFiles), "\n" if $nIgnored;
warn "Non-inbox files to process: \n\t", join("\n\t", @trashFiles), "\n";
warn "(Messages in non-inbox files will be deleted from Inbox files.)\n"
	if $deleteTrashed;
warn "Inbox files to process: \n\t", join("\n\t", @ARGV), "\n";

################### DieUsage ###################
sub DieUsage
{
die "Usage: cleanemail.perl [options ...] file1.mbox ...
Common options:
	-noop
		Do nothing, but show what would be done.
	-i inboxFile
		Process inboxFile as an Inbox even if it would
		not be recognized as an Inbox (based on its name).
	-t trashFile
		Process trashFile as a trash file or other non-inbox file
		even if it appears to be an Inbox (based on its name).
	-db
		Delete backup files when finished.

Other options not usually needed:
	-debug
		Show debugging output
	-cr
		Remove carriage returns (Default)
	-f
		Force overwrite of existing temp files
	-toc
		Remove corresponding *.toc and *.ev-summary files (Default)
	-dt
		Delete trash (previously deleted mail) from inboxes. (Default)
	-maxMessageStartLength	mmsl
		Set \$maxMessageStartLength to mmsl (Default is $maxMessageStartLength)
";
}


# As a precaution, look for any potential email client lock files.
# We don't want to be running this while the email client is running.
foreach my $trashFile (@trashFiles, @ARGV)
	{
	foreach my $f ("OWNER.LOK")
		{
		my $lockFile = $trashFile;
		$lockFile =~ s/[^\\\/]+\Z//;	# Chop off tail
		$lockFile .= $f;
		die "DANGER!  I found a possible lock file: $lockFile\n You must exit your email client before running this program!\n"
			if -e $lockFile;
		}
	}

my ($tnMsgs, $tnDupes, $tnTrashed, $tnOutput) = (0, 0, 0, 0); # Totals

# Process trash files first, so that we know what was
# previously deleted.  
my %trashHashes = ();	# Hashes of all known trash msgs
foreach my $trashFile (@trashFiles)
	{
	print STDERR "Processing non-inbox: $trashFile\n";
	my ($bakFile, $nMsgs, $nDupes, $nTrashed, $nOutput, %newHashes) =
		&ProcessMailbox($trashFile, ());
	die if $nTrashed != 0;
	if ($nMessagesPerSummary == 0)
		{
		warn "\n";
		warn "  Total messages: $nMsgs\n";
		# warn "  Duplicates removed ($duplicateChar): $nDupes\n";
		warn "  Duplicates removed: $nDupes\n";
		warn "  Messages output: $nOutput\n";
		}
	$tnMsgs += $nMsgs;
	$tnDupes += $nDupes;
	$tnTrashed += $nTrashed;
	$tnOutput += $nOutput;
	&DeleteOrSaveBackup($bakFile, $nDupes+$nTrashed);
	%trashHashes = (%trashHashes, %newHashes);
	}

# Process all Inbox mailboxes
%trashFiles = map {($_,$_)} @trashFiles;
foreach my $mailFile (@ARGV)
	{
	if (exists($trashFiles{$mailFile}))
		{
		warn "Skipping non-inbox file already processed: $mailFile\n";
		next;
		}
	warn "Processing inbox: $mailFile\n";
	my ($bakFile, $nMsgs, $nDupes, $nTrashed, $nOutput, %msgHashes) =
		&ProcessMailbox($mailFile, %trashHashes);
	if ($nMessagesPerSummary == 0)
		{
		warn "\n";
		warn "  Messages found: $nMsgs\n";
		# warn "  Duplicates deleted ($duplicateChar): $nDupes\n";
		# warn "  Previously moved messages deleted ($deletedChar): $nTrashed\n";
		warn "  Duplicates deleted: $nDupes\n";
		warn "  Previously moved messages deleted: $nTrashed\n";
		warn "  Messages output: $nOutput\n";
		}
	$tnMsgs += $nMsgs;
	$tnDupes += $nDupes;
	$tnTrashed += $nTrashed;
	$tnOutput += $nOutput;
	&DeleteOrSaveBackup($bakFile, $nDupes+$nTrashed);
	}

if (@ARGV + @trashFiles > 1)
	{
	warn "\n";
	warn "========= Grand Totals =========\n";
	warn "Total messages found: $tnMsgs\n";
	# warn "Total duplicates deleted ($duplicateChar): $tnDupes\n";
	# warn "Total previously moved messages deleted ($deletedChar): $tnTrashed\n";
	warn "Total duplicates deleted: $tnDupes\n";
	warn "Total previously moved messages deleted: $tnTrashed\n";
	warn "Total Messages output: $tnOutput\n";
	}

exit 0;


########### DeleteOrSaveBackup #############
sub DeleteOrSaveBackup
{
@_ == 2 || die;
my ($bakFile, $nDupesOrTrashed) = @_;
if ($nDupesOrTrashed == 0)
	{
	warn "  Original file was unchanged -- no need for a backup file.\n\n";
	}
elsif ($deleteBackups)
	{
	warn "  Could not delete backup file: $bakFile\n\n"
		if !unlink($bakFile);
	}
else	{
	warn "  Original file saved in: $bakFile\n\n";
	}
}

############### ReadMessage ###############
# Read and return the next message from the given $mailFile.
# Return the empty string "" if EOF.
sub ReadMessage
{
@_ == 1 || die;
my ($mailFile) = @_;
my $msg = &ReadLine($mailFile);
return("") if !defined($msg);
my $bytesRead = length($msg);
# First line should match messageStartPattern:
if ($msg !~ m/$messageStartPattern/) 
	{
	die "ERROR: Unrecognized message start line: $msg\nDoes not match messageStartPattern: $messageStartPattern\n";
	}
# Read lines until we reach the start of the next message.
while (defined($line = &ReadLine($mailFile)))
	{
	$bytesRead += length($line);
	# See if it's the start of a new message.
	# From ???@??? Mon Sep 23 11:23:37 2002
	if ($line =~ m/$messageStartPattern/ && $msg)
		{
		# warn "MessageStart: $line";
		if (length($line) > $maxMessageStartLength)
			{
			die "ERROR: Possibly corrupt mailbox.  Message start line is \nlonger than $maxMessageStartLength .\nMessage start line was: $line\nUse -maxMessageStartLength n option to specify a \nlarger max if this start line looks okay.\n";
			}
		# Push back this line and return what we have so far.
		&UnReadLine($mailFile, $line);
		$bytesRead -= length($line);
		last;
		}
	$msg .= $line;
	$bytesRead == length($msg) || die;
	}
return($msg);
}

############## ProcessMailbox ################
# Given a mailFile and (possibly) some hashes of trashed messages,
# rewrite the given mailFile.
# Return:
#	backup filename
#	number of messages read
#	number of duplicates removed
#	number of previously trashed messages removed
#	hashes of remaining messages
sub ProcessMailbox
{
@_ >= 1 || die;
my ($mailFile, %trashHashes) = @_;
if (!&Open($mailFile))
	{
	warn "WARNING: Could not open: $mailFile\n";
	next;
	}
# Open temp files
my $tmpFile = "$mailFile.$$.temp";	# Will be new mailbox
my $debugDupeFile = "$mailFile.duplicates.$$.temp";	# Duplicate messages 
my $debugTrashedFile = "$mailFile.trashed.$$.temp";	# Already trashed messages
foreach my $f ($tmpFile, $debugDupeFile, $debugTrashedFile)
	{
	if (-e $f && !$force)
		{
		die "ERROR: Temp file exists: $f\n  Delete it first or use -f option if you KNOW it is okay to clobber it.\n";
		}
	if (!open($f, ">$f"))
		{
		die "WARNING: Could not open for write: $f\n";
		}
	}
my %msgHashes = ();	# Hashes of msgs output for this file
my $nMsgs = 0;		# Totals for this file
my $nDupes = 0;
my $nTrashed = 0;
my $nOutput = 0;
my $bytesMsgs = 0;
my $bytesDupes = 0;
my $bytesTrashed = 0;
my $bytesOutput = 0;
my $msg = "";		# Current message
while ($msg = &ReadMessage($mailFile))
	{
	$bytesMsgs += length($msg);
	my ($header, $body) = &BreakMessage($msg);
	if ($debugHash && ($msg =~ m/Pham/))
		{
		print "msg:$msg\n\n";
		}
	my $hash = &MakeMessageHash($header, $body);
	if (exists($trashHashes{$hash}))
		{
		# Message was already trashed
		$nTrashed++;
		$bytesTrashed += length($msg);
		print STDERR $deletedChar if $progressCharsPerLine;
		print $debugTrashedFile $msg;
		}
	elsif (exists($msgHashes{$hash}))
		{
		# Duplicate message
		$nDupes++;
		$bytesDupes += length($msg);
		print STDERR $duplicateChar if $progressCharsPerLine;
		print $debugDupeFile $msg;
		}
	else	{
		# New message, not trashed.  Output it.
		print $tmpFile $msg;
		$nOutput++;
		$bytesOutput += length($msg);
		$msgHashes{$hash} = 1;
		print STDERR $savedChar if $progressCharsPerLine;
		}
	$nMsgs++;
	print STDERR "  Messages found: $nMsgs Duplicates: $nDupes Previously moved: $nTrashed\r" if $nMessagesPerSummary 
			&& ($nMsgs % $nMessagesPerSummary) == 0;
	print STDERR "\n" if $progressCharsPerLine 
			&& ($nMsgs % $progressCharsPerLine) == 0;
	# Sanity checks
	$nMsgs == $nDupes + $nTrashed + $nOutput || die;
	$bytesMsgs == $bytesDupes + $bytesTrashed + $bytesOutput || die;
	die if ($nMsgs > 0 && $bytesMsgs <= 0);
	die if ($bytesMsgs > 0 && $nMsgs <= 0);
	}
print STDERR "  Messages found: $nMsgs Duplicates: $nDupes Previously moved: $nTrashed\r" if $nMessagesPerSummary;
warn "\n";
# Close files
&Close($mailFile) || die;
foreach my $f ($tmpFile, $debugDupeFile, $debugTrashedFile)
	{
	close($f) || die;
	}
# More sanity checks
if ((-s $mailFile) != (-s $debugDupeFile) + (-s $debugTrashedFile) + (-s $tmpFile))
	{
	warn "\nWARNING: Resulting file sizes do not balance.\n";
	warn "This is sometimes due to inconsistent carriage return (ctl-M)\n";
	warn "treatment at the ends of lines.\n";
	if (!$rmCR)
		{
		warn "If you wish, you can use the -cr option to normalize them.";
		warn "However, you might still get this warning.\n";
		}
	}
# Were any messages removed?
my $nRemoved = $nDupes + $nTrashed;
# Prepare to move input to backup file
my $bakFile = "$mailFile.$$.$thisProgram.bak";
if (-e $bakFile && (!$noop) && !unlink($bakFile))
	{
	die "\nERROR: Cannot delete old backup file: $bakFile\n";
	}
# Save input file as backup, and rename temp file to input file.
if ($noop)
	{
	$debug || unlink($tmpFile) || die "\nERROR: Cannot delete temp file: $tmpFile\n";
	}
elsif ($nRemoved == 0)
	{ 
	# No need to make a backup if nothing was changed.
	$debug || unlink($tmpFile) || die "\nERROR: Cannot delete temp file: $tmpFile\n";
	$bakFile = "(No backup because original file was unchanged)";
	}
else
	{
	rename($mailFile, $bakFile) || die "\nERROR: Could not rename $mailFile to $bakFile\n";
	rename($tmpFile, $mailFile) || die "\nERROR: Could not rename $tmpFile to $mailFile\n";
	}
if ($debug)
	{
	warn "\n\tDebug files saved: \n\t\t$tmpFile\n\t\t$debugDupeFile\n\t\t$debugTrashedFile\n";
	}
else	{
	unlink($debugDupeFile) || die "\nERROR: Could not delete $debugDupeFile\n";
	unlink($debugTrashedFile) || die "\nERROR: Could not delete $debugTrashedFile\n";
	}
if ($nRemoved && $rmTOC && !$noop)
	{
	foreach my $ext ( @tocExtensions )
		{
		my $tocFile = $mailFile;
		$tocFile =~ s/\.[^\.]*\Z//;
		# $tocFile .= ".toc";
		$tocFile .= ".$ext";
		if (-e $tocFile)
			{
			if (unlink($tocFile)) 
				{
				warn "Deleted $tocFile\n";
				}
			else	{
				warn "\nWARNING: Could not delete $tocFile\n";
				}
			}
		else	{
			# warn "(No .toc file found to delete: $tocFile )\n";
			}
		}
	}

return ($bakFile, $nMsgs, $nDupes, $nTrashed, $nOutput, %msgHashes);
}

############ BreakMessage ##############
# Separate message into Header and Body, and canonicalize the header
# by unfolding continuation lines in the header, and
# removing carriage returns (ctrl-M) from the ends of lines
# in the body.
sub BreakMessage
{
@_ == 1 || die;
my ($t) = @_;
my $msg = $t;
# Remove carriage returns at the ends of lines:
while ($msg =~ s/\r+\n/\n/gm) {}
# Look for header/body separator
if ( $msg !~ m/\A(.+\n)+?(\r?)\n/m ) 
	{
	my $max = 2048;
	my $t = substr($msg,0,$max);	# Truncate to $max length
	warn "\nERROR: Could not find message header/body separator!  \n";
	warn "Mailbox may be corrupt. Ending line number read: $. \n\n";
	warn "Message (truncated to $max bytes): \n";
	warn "============================================\n";
	warn "$t\n";
	warn "============================================\n";
	die "\n";
	}
my $header = $&;
my $body = $';
# Unfold continuation lines:
while ($header =~ s/(\r?)\n([ \t])/$2/gm) {}
return( $header, $body );
}

############ MakeMessageHash ############
# The message hash returned will consist of the entire canonicalized
# header (minus the first line and any "Status:" lines), plus
# the length of the message body and a checksum of the message body.
sub MakeMessageHash
{
@_ == 2 || die;
my ($header, $body) = @_;
# Canonicalize the headers, and select only the headers that matter.
my $h = $header;
# Unfold any folded header lines.  See http://www.ietf.org/rfc/rfc0822.txt
# sec. 3.1.1 and http://www.ietf.org/rfc/rfc2822.txt sec 2.2.3.
### Hmm, old code was wrong?  Old code:
### $h =~ s/\A.*\n([ \t].*\n)*//;
# Already done by &BreakMessage:
# $h =~ s/\n([ \t])/$1/g;
my @headerLines = split(/(\r?)\n/, $h);
my $nheaderLines = scalar(@headerLines);
# warn "nheaderLines: $nheaderLines\n";
my @wanted = sort grep {m/\A($headersWantedPattern)/} @headerLines;
my $nWanted = scalar(@wanted);
# warn "nWanted: $nWanted\n";
my $headersWanted = join("\n", @wanted) . "\n";
# warn "headersWanted: $headersWanted\n\n";

# Now canonicalize the body.
# Ignore numbers at the end of attachment filenames.
# Numbers are added by the mailer to make the filenames unique.  But this
# means that if the same message is delivered twice, the
# attachment names will differ unless we do this.
#	Attachment Converted: "c:\documents and settings\daboo\my documents\home\email\eudora\attach\arch-02-10-04.htm"^M
my $oldBody = $body;
# print "------------------ Body -----------------------\n$body\n\n";
if ($body =~ s/^(Attachment\ Converted\:\ *\"([^\"]*?))\d+((\.?([^\.\"\\\/]*))\")/$1$3/mg)
	{
	# print "=============== Old ============\n";
	# print "$oldBody\n";
	# print "=============== New ============\n";
	# print "$body\n";
	# die "\n";
	}
# Same thing for embedded attachments.
#	130c130
#	< Embedded Content: image002149.gif: 00000001,345b86fc,00000000,60b15b20
#	---
#	> Embedded Content: image002162.gif: 00000001,345b86fc,00000000,60b15b20
$oldBody = $body;
if ($body =~ s/^(Embedded\ Content\:\ *([^\:]*?))\d+((\.?([^\.\"\\\/]*))\:)/$1$3/mg)
	{
	# print "=============== Old ============\n";
	# print "$oldBody\n";
	# print "=============== New ============\n";
	# print "$body\n";
	# die "\n";
	}

# Make the final hash/checksum:
my $length = length($body);
# Got this line from the perlfun manual page on unpack;
my $checksum = unpack("%32C*", $body);
# warn "checksum: $checksum\n";
my $hash = "$headersWanted\n$length\n$checksum\n";
# warn "=========================\nMakeMessageHash: \n$hash\n==============================\n";
return $hash;
}

############## WriteOrAppendFile ###############
sub WriteOrAppendFile
{
@_ >= 2 || die;
my ($f, $mode, @lines) = @_;
open($f, "$mode$f") || die "\nWriteOrAppendFile: Cannot write/append $mode file: $f\n";
print $f @lines;
close($f);
}

############## WriteFile ###############
sub WriteFile
{
@_ >= 1 || die;
my ($f, @lines) = @_;
&WriteOrAppendFile($f, ">", @lines);
}

############## AppendFile ###############
sub AppendFile
{
@_ >= 1 || die;
my ($f, @lines) = @_;
&WriteOrAppendFile($f, ">>", @lines);
}

####################################################################
####################### File reading with unread ###################
sub BEGIN
{
# Private data on currently open files:
my %buffers = ();	# Line buffers for all open files.
my %eof = ();		# Already hit eof?
############## Open ##############
# Open for reading
sub Open
{
@_ == 1 || die;
my ($f) = @_;
$buffers{$f} = [];
$eof{$f} = 0;
return(open($f,"<$f"));
}
############## Close ##############
sub Close
{
@_ == 1 || die;
my ($f) = @_;
exists($buffers{$f}) || die;
warn "Close: WARNING: Closed file with unread data in line buffer\n"
	if pop(@{$buffers{$f}});
delete($buffers{$f});
delete($eof{$f});
return(close($f));
}
############## ReadLine ##############
sub ReadLine
{
@_ == 1 || die;
my ($f) = @_;
exists($buffers{$f}) || die "ERROR: Attempt to read from unopen file: $f\n";
my $line = pop(@{$buffers{$f}});
return $line if defined($line);
return undef if $eof{$f};
$line = <$f>;
$line =~ s/\r+\n/\n/ if $rmCR && defined($line);	# Global $rmCR
# print $line if defined($line);
$eof{$f} = 1 if !defined($line);
return $line;
}
############## UnReadLine ##############
sub UnReadLine
{
@_ == 2 || die;
my ($f, $line) = @_;
exists($buffers{$f}) || die;
push(@{$buffers{$f}}, $line);
return 1;
}
} # sub BEGIN