#! /usr/bin/perl -w # NAME: cleanemail.perl - Remove deleted or duplicate messages from POP # mailboxes. # # Author: David Booth 4-Oct-2002. Improved 9-Jan-2006, 11-Mar-2009 # and 24-Nov-2009. For contact info see: http://dbooth.org/ # # License: GNU GPL: http://www.opensource.org/licenses/gpl-license.php # # INTRODUCTION: # This program removes duplicate or previously deleted messages # from POP mailbox files, permanently deleting them. # It is especially useful for cases in which your email client loses # track of which messages it has downloaded from your mail server, # and downloads all messages again, thus creating hundreds or thousands # of duplicate messages in your Inbox. # # This program works for Evolution and previously worked for Eudora # (and hopefully still does, though I haven't tested it for quite a while). # It should also work for other mail clients that use the mbox file # format to store messages, though it may not know to delete associated # index files. Please let me know your experience if you try it with # other email clients. # # QUICK START FOR EVOLUTION USERS: # 1. In Evolution, create a top level mail folder called "Trash-emptied" # (if you don't already have one) and copy the entire contents of # your Trash folder to the Trash-emptied folder. This will allow # previously deleted messages to be detected and re-deleted from your # Inbox and other folders. It doesn't matter if there are duplicate # messages in this folder. # 2. Exit from Evolution! # 3. cd ~/.evolution/mail/local # 4. Run: perl cleanemail.perl * # Or, if you use sub-folders, run: perl cleanemail.perl * *.sbd/* # # QUICK START FOR EUDORA USERS: # 1. In Eudora, you should # "Special->Compact Mailboxes". This normalizes the *.mbx files. # Otherwise, the *.toc files seem to indicate which messages have # been logically but not physically moved from the *.mbx files. # 2. Exit from Eudora! # 3. cd your-email-directory # where your-email-directory is your email directory. # 4. Run: perl cleanemail.perl *.mbx # # OVERVIEW: # This program deletes duplicate or previously moved or trashed # messages from mailboxes that are stored in mbox format: # http://en.wikipedia.org/wiki/Mbox # http://www.qmail.org/man/man5/mbox.html # Messages deleted by this program are REALLY deleted, i.e., # they are NOT just moved to a Trash folder. # # Inbox(s) are processed slightly differently from non-inbox files. # Since users often move messages from the Inbox to other folders # (including the trash), any messages that appear in # any non-inbox file will be deleted from the Inbox file(s). # Furthermore, duplicate messages in any file will also be deleted. # # WARNING: There is a danger to this algorithm. Suppose you receive # multiple copies of a message in your Inbox, you manually moving # one of the duplicates to the trash, and then you run this # program. In this case, this program will detect the message in # your trash and delete all copies of it from your Inbox, under # the assumption that you intended to trash all copies, leaving # only the single copy of the message in your trash. # To avoid this problem, you should not manually delete ANY messages # when you receive duplicates. Instead, run this program to delete # the duplicates. # # Files are modified in place. However, for safety # all work is initially performed on temporary files. After a file # has been processed, it is renamed to *.cleanemail.bak # (as a backup) and then the temp file is renamed to take # the place of the original file. If you want the .bak files # to be automatically deleted afterward, use the -db or # -deleteBackups option. # # Certain files get special treatment: # Out.mbx -- Ignored. Not processed at all. # Sent.mbx -- Ignored. Not processed at all. # Drafts.mbx -- Ignored. Not processed at all. # Default trash files (for special treatment as trash): # Trash.mbx -- Processed as a trash file. # Trash-*.mbx -- Processed as a trash file. # SPAMn.mbx -- (Where n is a number) Processed as a trash file. # _SPAMn.mbx -- (Where n is a number) Processed as a trash file. # _SuspectedSPAM.mbx -- Processed as a trash file. # Extensions of evolution and other files that are also ignored: my @extensionsToIgnore = qw( ev-summary-meta cmeta ev-summary index data temp bak db sbd ); my $extensionsToIgnorePattern = join("|", map {quotemeta($_)} @extensionsToIgnore); ############### Maintenance Note ################## # Earlier versions of this program distinguised "trash" files from # regular mailbox files, with the idea that if a message appeared in # a trash file, it should be deleted from the regular mailbox files # under the assumption that the user had previously deleted it. # However, I later realized that instead the program should distinguish # the *Inbox* from other mailbox files, and if a message appears in # any of the other mailbox files (including trash files) it should be # deleted from the Inbox under the assumption that the user had previously # moved it *from* the Inbox to that folder (either because of deleting # it or just for organizational purposes). Therefore, although the code # below refers to "trashFiles", these are now any regular mail files # *except* for the Inbox. ################################################### # # A note about carriage returns (ctl-M, or "CR") # ============================================== # MS Windows machines seem to automatically strip CR off the end # of a line on reading, and add a CR when writing. # This caused me a lot of grief in debugging this program, # because the number of bytes read/written were not matching the file sizes! # To make things work right under W2K, by default I delete CRs # at the end of a line on input. (This is controlled by the global # $rmCR flag -- see &ReadLine.) # Since I delete any number of MULTIPLE CRs at the # end of a line (not just one), this means that the resulting messages may be # slightly DIFFERENT from the original messages. # In the 5+ years of using this script I have not seen this cause a problem, # but it seems plausible to # me that it could for some data. I'm afraid I don't know the # rules for email message syntax well enough to know whether it could. ########################## Main ############################# my $thisProgram = "cleanemail"; # What pattern indicates the start of a new message? # Here are some normal ones: # From csail-related-bounces@lists.csail.mit.edu Wed Sep 8 18:34:17 2004 # From ???@??? Mon Sep 23 11:23:37 2002 # Here are some weird ones (probably from spammers): # From [?var=TAGMAILFROM] Wed Jan 21 12:43:03 2009 # From \Annette Gorman" Mon Sep 15 21:49:32 2008 # From _almansour@acee.net Thu Mar 20 10:00:03 2008 # From a%CUST_WORD@disnet1.org Wed Dec 24 23:42:24 2008 # From a-manterola@ej-gv.es Mon Sep 8 16:06:49 2008 # From de'an@4ur.com Sat Jan 5 15:29:20 2008 # From et{ist{_1995@dudapropaganda.com.br Tue Oct 28 21:04:57 2008 # From manager#05@wachovia.com Thu Dec 11 16:51:34 2008 # my $messageStartPattern = "\\AFrom \\s*[^\\@]*\\@[^\\@]*.*\\d+\\:\\d+\\:\\d+\\s+\\d+\\s*\\Z"; # Don't require an @ because some spam doesn't use it: my $messageStartPattern = "\\AFrom .*\\d+\\:\\d+\\:\\d+\\s+\\d+\\s*\\Z"; my $maxMessageStartLength = 400; # Anything longer than this may indicate an error. # warn "maxMessageStartLength: $maxMessageStartLength messageStartPattern: $messageStartPattern\n"; my $progressCharsPerLine = 50; # Set to 0 if progress indicator is not wanted. my $savedChar = "."; # Indicates a message retained my $deletedChar = "X"; # Indicates a trashed message deleted my $duplicateChar = "x"; # Indicates a duplicate message deleted my $nMessagesPerSummary = 100; # Set to 0 for no progress summary. $progressCharsPerLine = 0 if $nMessagesPerSummary; # Override? # In comparing messages to determine duplicates, these headers will # be compared and all other headers will be ignored. (A checksum of # the message body will always be compared also. See the &Hash() # function below.) # Do not confuse the "From" header with the "From:' # header. The former has a time-stamp in it that we do NOT # want to compare. my @headersWanted = qw( Return-Path: Date: From: Subject: Sender: To: ); my $headersWantedPattern = join("|", map {quotemeta($_)} @headersWanted); # warn "headersWantedPattern: $headersWantedPattern\n"; # Get options and look for trash files my $force = 0; # -f -- Force. (Clobber existing .temp files) my $debug = 0; # -debug -- Keep .temp files. my $noop = 0; # -noop -- No operation. Show what would be done, # but do not save the results. my $verbose = 0; # -v -- Show lots of info. my $rmCR = 1; # -cr -- Remove carriage returns at ends of lines. my $rmTOC = 1; # -toc -- Remove the .toc files (but not Out.toc) my @tocExtensions = qw( toc ev-summary ); # Extensions of files to remove # if $rmTOC is set. my $deleteTrashed = 1; # -deleteTrashed # -- Delete previously trashed or moved messages. # This causes messages found in trash folders # to be deleted from non-trash folders. my $deleteBackups = 0; # -deleteBackups my $debugHash = 0; my $trashPattern = "(Trash(.*)|(_?)((Suspected)?)SPAM([\-\_]\\d*)"; my $outPattern = "(Out|Outbox|Sent|Drafts|Templates)"; my $inPattern = "(In|Inbox)"; my @trashFiles = (); my @inFiles = (); my @args = (); my @ignoredFiles = (); while(@ARGV) { my $arg = shift @ARGV; if (0) {} elsif ($arg eq "-maxMessageStartLength") { $maxMessageStartLength = shift @ARGV; } elsif ($arg eq "-v") { $verbose = 1; } elsif ($arg eq "-n") { $noop = 1; } elsif ($arg eq "-noop") { $noop = 1; } elsif ($arg eq "-cr") { $rmCR = 1; } elsif ($arg eq "-f") { $force = 1; } elsif ($arg eq "-toc") { $rmTOC = 1; } elsif ($arg eq "-debug") { $debug = 1; } elsif ($arg eq "-deleteBackups") { $deleteBackups = 1; } elsif ($arg eq "-db") { $deleteBackups = 1; } elsif ($arg eq "-deleteTrashed") { $deleteTrashed = 1; } elsif ($arg eq "-dt") { $deleteTrashed = 1; } elsif ($arg eq "-i") { push(@inFiles, shift @ARGV); } elsif ($arg eq "-t") { push(@trashFiles, shift @ARGV); } elsif (-d $arg) { push(@ignoredFiles, $arg); } elsif ($arg =~ m/\.($extensionsToIgnorePattern)\Z/i) { push(@ignoredFiles, $arg); } elsif ($arg =~ m/\A(.*[\/\\])?$outPattern\Z/i) { push(@ignoredFiles, $arg); } ##### No longer check for trashPattern, because all files except ##### Inbox will now be treated as trash files: # elsif ($arg =~ m/\A(.*[\/\\])?$trashPattern\Z/i) { push(@trashFiles, $arg); } elsif ($arg =~ m/\A(.*[\/\\])?$inPattern\Z/i) { push(@inFiles, $arg); } elsif ($arg =~ m/\A\-/) { warn "Unknown option: $arg\n"; &DieUsage(); } else { push(@args, $arg); } } my $nIgnored = scalar(@ignoredFiles); # Number of files ignored @ARGV = @args; die "ERROR: Bad maxMessageStartLength\n" if $maxMessageStartLength <= 0; @ARGV || @inFiles || @trashFiles || $nIgnored || &DieUsage(); ##### These lines convert from the old approach of treating everything ##### as Inboxes except trash files, to the new approach of treating ##### everything *except* the Inboxes as trash files: my %inFiles = map {($_,$_)} @inFiles; my %trashFiles = map {($_,$_)} @trashFiles; my %otherFiles = map {($_,$_)} @ARGV; # Force trashFiles to not be considered inFiles: foreach my $f (keys %trashFiles) { delete($inFiles{$f}) if exists($inFiles{$f}); delete($otherFiles{$f}) if exists($otherFiles{$f}); } # Force inFiles to not be considered trashFiles: foreach my $f (keys %inFiles) { delete($trashFiles{$f}) if exists($trashFiles{$f}); delete($otherFiles{$f}) if exists($otherFiles{$f}); } # Treat other files as trash files by default: %trashFiles = map{($_,$_)} (keys %otherFiles, keys %trashFiles); @trashFiles = sort keys %trashFiles; @ARGV = sort keys %inFiles; if (@trashFiles && !$deleteTrashed) { warn "Non-inbox files found: \n\t",join("\n\t", @trashFiles), "\n"; warn "\nNOTE: Non-inbox files will be treated as Inbox files\n"; warn "because the -dt option was NOT specified. If desired, you\n"; warn "can use the -dt option to cause all messages found in any\n"; warn "trash file to be deleted from the regular mailbox files.\n\n"; @ARGV = (@trashFiles, @ARGV); @trashFiles = (); } warn "Files ignored: \n\t", join("\n\t", @ignoredFiles), "\n" if $nIgnored; warn "Non-inbox files to process: \n\t", join("\n\t", @trashFiles), "\n"; warn "(Messages in non-inbox files will be deleted from Inbox files.)\n" if $deleteTrashed; warn "Inbox files to process: \n\t", join("\n\t", @ARGV), "\n"; ################### DieUsage ################### sub DieUsage { die "Usage: cleanemail.perl [options ...] file1.mbox ... Common options: -noop Do nothing, but show what would be done. -i inboxFile Process inboxFile as an Inbox even if it would not be recognized as an Inbox (based on its name). -t trashFile Process trashFile as a trash file or other non-inbox file even if it appears to be an Inbox (based on its name). -db Delete backup files when finished. Other options not usually needed: -debug Show debugging output -cr Remove carriage returns (Default) -f Force overwrite of existing temp files -toc Remove corresponding *.toc and *.ev-summary files (Default) -dt Delete trash (previously deleted mail) from inboxes. (Default) -maxMessageStartLength mmsl Set \$maxMessageStartLength to mmsl (Default is $maxMessageStartLength) "; } # As a precaution, look for any potential email client lock files. # We don't want to be running this while the email client is running. foreach my $trashFile (@trashFiles, @ARGV) { foreach my $f ("OWNER.LOK") { my $lockFile = $trashFile; $lockFile =~ s/[^\\\/]+\Z//; # Chop off tail $lockFile .= $f; die "DANGER! I found a possible lock file: $lockFile\n You must exit your email client before running this program!\n" if -e $lockFile; } } my ($tnMsgs, $tnDupes, $tnTrashed, $tnOutput) = (0, 0, 0, 0); # Totals # Process trash files first, so that we know what was # previously deleted. my %trashHashes = (); # Hashes of all known trash msgs foreach my $trashFile (@trashFiles) { print STDERR "Processing non-inbox: $trashFile\n"; my ($bakFile, $nMsgs, $nDupes, $nTrashed, $nOutput, %newHashes) = &ProcessMailbox($trashFile, ()); die if $nTrashed != 0; if ($nMessagesPerSummary == 0) { warn "\n"; warn " Total messages: $nMsgs\n"; # warn " Duplicates removed ($duplicateChar): $nDupes\n"; warn " Duplicates removed: $nDupes\n"; warn " Messages output: $nOutput\n"; } $tnMsgs += $nMsgs; $tnDupes += $nDupes; $tnTrashed += $nTrashed; $tnOutput += $nOutput; &DeleteOrSaveBackup($bakFile, $nDupes+$nTrashed); %trashHashes = (%trashHashes, %newHashes); } # Process all Inbox mailboxes %trashFiles = map {($_,$_)} @trashFiles; foreach my $mailFile (@ARGV) { if (exists($trashFiles{$mailFile})) { warn "Skipping non-inbox file already processed: $mailFile\n"; next; } warn "Processing inbox: $mailFile\n"; my ($bakFile, $nMsgs, $nDupes, $nTrashed, $nOutput, %msgHashes) = &ProcessMailbox($mailFile, %trashHashes); if ($nMessagesPerSummary == 0) { warn "\n"; warn " Messages found: $nMsgs\n"; # warn " Duplicates deleted ($duplicateChar): $nDupes\n"; # warn " Previously moved messages deleted ($deletedChar): $nTrashed\n"; warn " Duplicates deleted: $nDupes\n"; warn " Previously moved messages deleted: $nTrashed\n"; warn " Messages output: $nOutput\n"; } $tnMsgs += $nMsgs; $tnDupes += $nDupes; $tnTrashed += $nTrashed; $tnOutput += $nOutput; &DeleteOrSaveBackup($bakFile, $nDupes+$nTrashed); } if (@ARGV + @trashFiles > 1) { warn "\n"; warn "========= Grand Totals =========\n"; warn "Total messages found: $tnMsgs\n"; # warn "Total duplicates deleted ($duplicateChar): $tnDupes\n"; # warn "Total previously moved messages deleted ($deletedChar): $tnTrashed\n"; warn "Total duplicates deleted: $tnDupes\n"; warn "Total previously moved messages deleted: $tnTrashed\n"; warn "Total Messages output: $tnOutput\n"; } exit 0; ########### DeleteOrSaveBackup ############# sub DeleteOrSaveBackup { @_ == 2 || die; my ($bakFile, $nDupesOrTrashed) = @_; if ($nDupesOrTrashed == 0) { warn " Original file was unchanged -- no need for a backup file.\n\n"; } elsif ($deleteBackups) { warn " Could not delete backup file: $bakFile\n\n" if !unlink($bakFile); } else { warn " Original file saved in: $bakFile\n\n"; } } ############### ReadMessage ############### # Read and return the next message from the given $mailFile. # Return the empty string "" if EOF. sub ReadMessage { @_ == 1 || die; my ($mailFile) = @_; my $msg = &ReadLine($mailFile); return("") if !defined($msg); my $bytesRead = length($msg); # First line should match messageStartPattern: if ($msg !~ m/$messageStartPattern/) { die "ERROR: Unrecognized message start line: $msg\nDoes not match messageStartPattern: $messageStartPattern\n"; } # Read lines until we reach the start of the next message. while (defined($line = &ReadLine($mailFile))) { $bytesRead += length($line); # See if it's the start of a new message. # From ???@??? Mon Sep 23 11:23:37 2002 if ($line =~ m/$messageStartPattern/ && $msg) { # warn "MessageStart: $line"; if (length($line) > $maxMessageStartLength) { die "ERROR: Possibly corrupt mailbox. Message start line is \nlonger than $maxMessageStartLength .\nMessage start line was: $line\nUse -maxMessageStartLength n option to specify a \nlarger max if this start line looks okay.\n"; } # Push back this line and return what we have so far. &UnReadLine($mailFile, $line); $bytesRead -= length($line); last; } $msg .= $line; $bytesRead == length($msg) || die; } return($msg); } ############## ProcessMailbox ################ # Given a mailFile and (possibly) some hashes of trashed messages, # rewrite the given mailFile. # Return: # backup filename # number of messages read # number of duplicates removed # number of previously trashed messages removed # hashes of remaining messages sub ProcessMailbox { @_ >= 1 || die; my ($mailFile, %trashHashes) = @_; if (!&Open($mailFile)) { warn "WARNING: Could not open: $mailFile\n"; next; } # Open temp files my $tmpFile = "$mailFile.$$.temp"; # Will be new mailbox my $debugDupeFile = "$mailFile.duplicates.$$.temp"; # Duplicate messages my $debugTrashedFile = "$mailFile.trashed.$$.temp"; # Already trashed messages foreach my $f ($tmpFile, $debugDupeFile, $debugTrashedFile) { if (-e $f && !$force) { die "ERROR: Temp file exists: $f\n Delete it first or use -f option if you KNOW it is okay to clobber it.\n"; } if (!open($f, ">$f")) { die "WARNING: Could not open for write: $f\n"; } } my %msgHashes = (); # Hashes of msgs output for this file my $nMsgs = 0; # Totals for this file my $nDupes = 0; my $nTrashed = 0; my $nOutput = 0; my $bytesMsgs = 0; my $bytesDupes = 0; my $bytesTrashed = 0; my $bytesOutput = 0; my $msg = ""; # Current message while ($msg = &ReadMessage($mailFile)) { $bytesMsgs += length($msg); my ($header, $body) = &BreakMessage($msg); if ($debugHash && ($msg =~ m/Pham/)) { print "msg:$msg\n\n"; } my $hash = &MakeMessageHash($header, $body); if (exists($trashHashes{$hash})) { # Message was already trashed $nTrashed++; $bytesTrashed += length($msg); print STDERR $deletedChar if $progressCharsPerLine; print $debugTrashedFile $msg; } elsif (exists($msgHashes{$hash})) { # Duplicate message $nDupes++; $bytesDupes += length($msg); print STDERR $duplicateChar if $progressCharsPerLine; print $debugDupeFile $msg; } else { # New message, not trashed. Output it. print $tmpFile $msg; $nOutput++; $bytesOutput += length($msg); $msgHashes{$hash} = 1; print STDERR $savedChar if $progressCharsPerLine; } $nMsgs++; print STDERR " Messages found: $nMsgs Duplicates: $nDupes Previously moved: $nTrashed\r" if $nMessagesPerSummary && ($nMsgs % $nMessagesPerSummary) == 0; print STDERR "\n" if $progressCharsPerLine && ($nMsgs % $progressCharsPerLine) == 0; # Sanity checks $nMsgs == $nDupes + $nTrashed + $nOutput || die; $bytesMsgs == $bytesDupes + $bytesTrashed + $bytesOutput || die; die if ($nMsgs > 0 && $bytesMsgs <= 0); die if ($bytesMsgs > 0 && $nMsgs <= 0); } print STDERR " Messages found: $nMsgs Duplicates: $nDupes Previously moved: $nTrashed\r" if $nMessagesPerSummary; warn "\n"; # Close files &Close($mailFile) || die; foreach my $f ($tmpFile, $debugDupeFile, $debugTrashedFile) { close($f) || die; } # More sanity checks if ((-s $mailFile) != (-s $debugDupeFile) + (-s $debugTrashedFile) + (-s $tmpFile)) { warn "\nWARNING: Resulting file sizes do not balance.\n"; warn "This is sometimes due to inconsistent carriage return (ctl-M)\n"; warn "treatment at the ends of lines.\n"; if (!$rmCR) { warn "If you wish, you can use the -cr option to normalize them."; warn "However, you might still get this warning.\n"; } } # Were any messages removed? my $nRemoved = $nDupes + $nTrashed; # Prepare to move input to backup file my $bakFile = "$mailFile.$$.$thisProgram.bak"; if (-e $bakFile && (!$noop) && !unlink($bakFile)) { die "\nERROR: Cannot delete old backup file: $bakFile\n"; } # Save input file as backup, and rename temp file to input file. if ($noop) { $debug || unlink($tmpFile) || die "\nERROR: Cannot delete temp file: $tmpFile\n"; } elsif ($nRemoved == 0) { # No need to make a backup if nothing was changed. $debug || unlink($tmpFile) || die "\nERROR: Cannot delete temp file: $tmpFile\n"; $bakFile = "(No backup because original file was unchanged)"; } else { rename($mailFile, $bakFile) || die "\nERROR: Could not rename $mailFile to $bakFile\n"; rename($tmpFile, $mailFile) || die "\nERROR: Could not rename $tmpFile to $mailFile\n"; } if ($debug) { warn "\n\tDebug files saved: \n\t\t$tmpFile\n\t\t$debugDupeFile\n\t\t$debugTrashedFile\n"; } else { unlink($debugDupeFile) || die "\nERROR: Could not delete $debugDupeFile\n"; unlink($debugTrashedFile) || die "\nERROR: Could not delete $debugTrashedFile\n"; } if ($nRemoved && $rmTOC && !$noop) { foreach my $ext ( @tocExtensions ) { my $tocFile = $mailFile; $tocFile =~ s/\.[^\.]*\Z//; # $tocFile .= ".toc"; $tocFile .= ".$ext"; if (-e $tocFile) { if (unlink($tocFile)) { warn "Deleted $tocFile\n"; } else { warn "\nWARNING: Could not delete $tocFile\n"; } } else { # warn "(No .toc file found to delete: $tocFile )\n"; } } } return ($bakFile, $nMsgs, $nDupes, $nTrashed, $nOutput, %msgHashes); } ############ BreakMessage ############## # Separate message into Header and Body, and canonicalize the header # by unfolding continuation lines in the header, and # removing carriage returns (ctrl-M) from the ends of lines # in the body. sub BreakMessage { @_ == 1 || die; my ($t) = @_; my $msg = $t; # Remove carriage returns at the ends of lines: while ($msg =~ s/\r+\n/\n/gm) {} # Look for header/body separator if ( $msg !~ m/\A(.+\n)+?(\r?)\n/m ) { my $max = 2048; my $t = substr($msg,0,$max); # Truncate to $max length warn "\nERROR: Could not find message header/body separator! \n"; warn "Mailbox may be corrupt. Ending line number read: $. \n\n"; warn "Message (truncated to $max bytes): \n"; warn "============================================\n"; warn "$t\n"; warn "============================================\n"; die "\n"; } my $header = $&; my $body = $'; # Unfold continuation lines: while ($header =~ s/(\r?)\n([ \t])/$2/gm) {} return( $header, $body ); } ############ MakeMessageHash ############ # The message hash returned will consist of the entire canonicalized # header (minus the first line and any "Status:" lines), plus # the length of the message body and a checksum of the message body. sub MakeMessageHash { @_ == 2 || die; my ($header, $body) = @_; # Canonicalize the headers, and select only the headers that matter. my $h = $header; # Unfold any folded header lines. See http://www.ietf.org/rfc/rfc0822.txt # sec. 3.1.1 and http://www.ietf.org/rfc/rfc2822.txt sec 2.2.3. ### Hmm, old code was wrong? Old code: ### $h =~ s/\A.*\n([ \t].*\n)*//; # Already done by &BreakMessage: # $h =~ s/\n([ \t])/$1/g; my @headerLines = split(/(\r?)\n/, $h); my $nheaderLines = scalar(@headerLines); # warn "nheaderLines: $nheaderLines\n"; my @wanted = sort grep {m/\A($headersWantedPattern)/} @headerLines; my $nWanted = scalar(@wanted); # warn "nWanted: $nWanted\n"; my $headersWanted = join("\n", @wanted) . "\n"; # warn "headersWanted: $headersWanted\n\n"; # Now canonicalize the body. # Ignore numbers at the end of attachment filenames. # Numbers are added by the mailer to make the filenames unique. But this # means that if the same message is delivered twice, the # attachment names will differ unless we do this. # Attachment Converted: "c:\documents and settings\daboo\my documents\home\email\eudora\attach\arch-02-10-04.htm"^M my $oldBody = $body; # print "------------------ Body -----------------------\n$body\n\n"; if ($body =~ s/^(Attachment\ Converted\:\ *\"([^\"]*?))\d+((\.?([^\.\"\\\/]*))\")/$1$3/mg) { # print "=============== Old ============\n"; # print "$oldBody\n"; # print "=============== New ============\n"; # print "$body\n"; # die "\n"; } # Same thing for embedded attachments. # 130c130 # < Embedded Content: image002149.gif: 00000001,345b86fc,00000000,60b15b20 # --- # > Embedded Content: image002162.gif: 00000001,345b86fc,00000000,60b15b20 $oldBody = $body; if ($body =~ s/^(Embedded\ Content\:\ *([^\:]*?))\d+((\.?([^\.\"\\\/]*))\:)/$1$3/mg) { # print "=============== Old ============\n"; # print "$oldBody\n"; # print "=============== New ============\n"; # print "$body\n"; # die "\n"; } # Make the final hash/checksum: my $length = length($body); # Got this line from the perlfun manual page on unpack; my $checksum = unpack("%32C*", $body); # warn "checksum: $checksum\n"; my $hash = "$headersWanted\n$length\n$checksum\n"; # warn "=========================\nMakeMessageHash: \n$hash\n==============================\n"; return $hash; } ############## WriteOrAppendFile ############### sub WriteOrAppendFile { @_ >= 2 || die; my ($f, $mode, @lines) = @_; open($f, "$mode$f") || die "\nWriteOrAppendFile: Cannot write/append $mode file: $f\n"; print $f @lines; close($f); } ############## WriteFile ############### sub WriteFile { @_ >= 1 || die; my ($f, @lines) = @_; &WriteOrAppendFile($f, ">", @lines); } ############## AppendFile ############### sub AppendFile { @_ >= 1 || die; my ($f, @lines) = @_; &WriteOrAppendFile($f, ">>", @lines); } #################################################################### ####################### File reading with unread ################### sub BEGIN { # Private data on currently open files: my %buffers = (); # Line buffers for all open files. my %eof = (); # Already hit eof? ############## Open ############## # Open for reading sub Open { @_ == 1 || die; my ($f) = @_; $buffers{$f} = []; $eof{$f} = 0; return(open($f,"<$f")); } ############## Close ############## sub Close { @_ == 1 || die; my ($f) = @_; exists($buffers{$f}) || die; warn "Close: WARNING: Closed file with unread data in line buffer\n" if pop(@{$buffers{$f}}); delete($buffers{$f}); delete($eof{$f}); return(close($f)); } ############## ReadLine ############## sub ReadLine { @_ == 1 || die; my ($f) = @_; exists($buffers{$f}) || die "ERROR: Attempt to read from unopen file: $f\n"; my $line = pop(@{$buffers{$f}}); return $line if defined($line); return undef if $eof{$f}; $line = <$f>; $line =~ s/\r+\n/\n/ if $rmCR && defined($line); # Global $rmCR # print $line if defined($line); $eof{$f} = 1 if !defined($line); return $line; } ############## UnReadLine ############## sub UnReadLine { @_ == 2 || die; my ($f, $line) = @_; exists($buffers{$f}) || die; push(@{$buffers{$f}}, $line); return 1; } } # sub BEGIN