summaryrefslogtreecommitdiff
path: root/imapdedup.py
diff options
context:
space:
mode:
Diffstat (limited to 'imapdedup.py')
-rwxr-xr-ximapdedup.py588
1 files changed, 588 insertions, 0 deletions
diff --git a/imapdedup.py b/imapdedup.py
new file mode 100755
index 0000000..993ec17
--- /dev/null
+++ b/imapdedup.py
@@ -0,0 +1,588 @@
+#! /usr/bin/env python3
+#
+# imapdedup.py
+#
+# Looks for duplicate messages in a set of IMAP mailboxes and removes all but the first.
+# Comparison is normally based on the Message-ID header.
+#
+# Default behaviour is purely to mark the duplicates as deleted. Some mail clients
+# will allow you to view these and undelete them if you change your mind.
+#
+# Copyright (c) 2013-2020 Quentin Stafford-Fraser.
+# All rights reserved, subject to the following:
+#
+#
+# This is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This software is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this software; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+# USA.
+#
+
+import getpass
+import hashlib
+import imaplib
+import os
+import argparse
+import re
+import socket
+import sys
+from typing import List, Dict, Tuple, Optional, Type, Any
+
+from email.parser import BytesParser
+from email.message import Message
+from email.errors import HeaderParseError
+from email.header import decode_header
+
+# Increase the max line length that imaplib expects to get back from the server,
+# since we're often dealing with big folders and large numbers of messages.
+
+imaplib._MAXLINE = max(10_000_000, imaplib._MAXLINE)
+
+# If you choose the '-t' option, what tag (IMAP flag name)
+# shall we use on the server?
+TAG_NAME='duplicated'
+
+class ImapDedupException(Exception):
+ pass
+
+
+def check_response(resp: Tuple[str, List[bytes]]):
+ """
+ IMAP responses should normally begin 'OK'. Strip that off, or raise
+ an exception if it isn't there.
+ """
+ status, value = resp
+ if status != "OK":
+ raise ImapDedupException("Got response: %s from server" % str(value))
+ return value
+
+
+def get_arguments(args: Optional[List[str]] = None) -> Tuple[argparse.Namespace, List[str]]:
+ """
+ Parse the given command-line arguments - defaults to using sys.argv
+ """
+
+ parser = argparse.ArgumentParser(
+ description="Mark duplicate messages in IMAP mailboxes for deletion"
+ )
+ parser.add_argument(
+ "-P", "--process", dest="process", help="IMAP process to access mailboxes"
+ )
+ parser.add_argument("-s", "--server", dest="server", help="IMAP server")
+ parser.add_argument("-p", "--port", dest="port", help="IMAP server port", type=int)
+ parser.add_argument("-x", "--ssl", dest="ssl", action="store_true", help="Use SSL")
+ parser.add_argument("-X", "--starttls", dest="starttls", action="store_true", help="Require STARTTLS")
+ parser.add_argument("-u", "--user", dest="user", help="IMAP user name")
+ parser.add_argument("-K", "--keyring", dest="keyring", help="Keyring name to get password")
+ parser.add_argument(
+ "-w",
+ "--password",
+ dest="password",
+ help="IMAP password (Will prompt if not specified)",
+ )
+ parser.add_argument(
+ "-v", "--verbose", dest="verbose", action="store_true", help="Verbose mode"
+ )
+ parser.add_argument(
+ "-S", "--show", dest="show", action="store_true", help="Show duplicated messages"
+ )
+ parser.add_argument(
+ "-n",
+ "--dry-run",
+ dest="dry_run",
+ action="store_true",
+ help="Don't actually do anything, just report what would be done",
+ )
+ parser.add_argument(
+ "-c",
+ "--checksum",
+ dest="use_checksum",
+ action="store_true",
+ help="Use a checksum of several mail headers, instead of the Message-ID",
+ )
+ parser.add_argument(
+ "-b",
+ "--sentbefore",
+ dest="sent_before",
+ help="Only process messages sent before given date, given as d-m-y, e.g: 1-Feb-2020. Useful when there are many duplicates of each message",
+ )
+ parser.add_argument(
+ "-m",
+ "--checksum-with-id",
+ dest="use_id_in_checksum",
+ action="store_true",
+ help="Include the Message-ID (if any) in the -c checksum.",
+ )
+ parser.add_argument(
+ "--no-close",
+ dest="no_close",
+ action="store_true",
+ help='Do not "close" mailbox when done. Some servers will purge deleted messages on a close command.',
+ )
+ parser.add_argument(
+ "-l",
+ "--list",
+ dest="just_list",
+ action="store_true",
+ help="Just list mailboxes",
+ )
+ parser.add_argument(
+ "-r",
+ "--recursive",
+ dest="recursive",
+ action="store_true",
+ help="Remove duplicates recursively",
+ )
+ parser.add_argument(
+ "-R",
+ "--reverse",
+ dest="reverse",
+ action="store_true",
+ help="Walk through the folders in reverse order",
+ )
+ parser.add_argument(
+ "-t", "--only-tag", dest="only_tag", action="store_true",
+ help="Tag duplicates with '%s' instead of deleting them" % TAG_NAME
+ )
+ parser.add_argument('mailbox', nargs='*')
+
+ options = parser.parse_args(args)
+ mboxes = options.mailbox
+
+ if ((not options.server) or (not options.user)) and not options.process:
+ sys.stderr.write(
+ "\nError: Must specify server, user, and at least one mailbox.\n\n"
+ )
+ parser.print_help()
+ sys.exit(1)
+
+ if options.recursive and len(mboxes) > 1:
+ sys.stderr.write("\nError: You can only specify one mailbox if you use -r.\n")
+ sys.exit(1)
+
+ if options.use_id_in_checksum and not options.use_checksum:
+ sys.stderr.write("\nError: If you use -m you must also use -c.\n")
+ sys.exit(1)
+
+ if options.keyring:
+ import keyring
+ options.password = keyring.get_password(options.keyring, options.user)
+
+ if not options.password and not options.process:
+ # Read from IMAPDEDUP_PASSWORD env variable, or prompt for one.
+ options.password = os.getenv("IMAPDEDUP_PASSWORD") or getpass.getpass()
+
+ return (options, mboxes)
+
+
+# Thanks to http://www.doughellmann.com/PyMOTW/imaplib/
+list_response_pattern = re.compile(
+ rb'\((?P<flags>.*?)\) "(?P<delimiter>.*)" (?P<name>.*)'
+)
+
+def parse_list_response(line: bytes):
+ if not isinstance(line, bytes):
+ return None
+ m = list_response_pattern.match(line)
+ if m is None:
+ sys.stderr.write("\nError: parsing list response '{}'".format(str(line)))
+ sys.exit(1)
+ flags, delimiter, mailbox_name = m.groups()
+ mailbox_name = mailbox_name.strip(b'"')
+ return (flags, delimiter, mailbox_name)
+
+
+def str_header(parsed_message: Message, name: str) -> str:
+ """"
+ Return the value (of the first instance, if more than one) of
+ the given header, as a unicode string.
+ """
+ hdrlist = decode_header(parsed_message.get(name, ""))
+ btext, charset = hdrlist[0]
+ text = btext if isinstance(btext, str) else btext.decode("utf-8", "ignore")
+ return text.lstrip()
+
+
+def get_message_id(
+ parsed_message: Message, options_use_checksum=False, options_use_id_in_checksum=False
+) -> Optional[str]:
+ """
+ Normally, return the Message-ID header (or print a warning if it doesn't
+ exist and return None).
+
+ If options_use_checksum is specified, use md5 hash of several headers
+ instead.
+
+ For more safety, user should first do a dry run, reviewing them before
+ deletion. Problems are extremely unlikely, but md5 is not collision-free.
+
+ If options_use_id_in_checksum is specified, then the Message-ID will be
+ included in the header checksum, otherwise it is excluded.
+ """
+ try:
+ if options_use_checksum:
+ md5 = hashlib.md5()
+ sha = hashlib.sha256()
+ sha3 = hashlib.sha3_256()
+ def update(x):
+ md5.update(x)
+ sha.update(x)
+ sha3.update(x)
+ update(("From:" + str_header(parsed_message, "From")).encode())
+ update(("To:" + str_header(parsed_message, "To")).encode())
+ update(("Subject:" + str_header(parsed_message, "Subject")).encode())
+ update(("Date:" + str_header(parsed_message, "Date")).encode())
+ update(("Cc:" + str_header(parsed_message, "Cc")).encode())
+ update(("Bcc:" + str_header(parsed_message, "Bcc")).encode())
+ if options_use_id_in_checksum:
+ update(("Message-ID:" + str_header(parsed_message, "Message-ID")).encode())
+ msg_id = md5.hexdigest() + "|" + sha.hexdigest() + "|" + sha3.hexdigest()
+ # print(msg_id)
+ else:
+ msg_id = str_header(parsed_message, "Message-ID")
+ if not msg_id:
+ print(
+ (
+ "Message '%s' dated '%s' has no Message-ID header."
+ % (
+ str_header(parsed_message, "Subject"),
+ str_header(parsed_message, "Date"),
+ )
+ )
+ )
+ print("You might want to use the -c option.")
+ return None
+ return msg_id.lstrip()
+
+ except (ValueError, HeaderParseError):
+ print(
+ "WARNING: There was an exception trying to parse the headers of this message."
+ )
+ print("It may be corrupt, and you might consider deleting it.")
+ print(
+ (
+ "Subject: %s\nFrom: %s\nDate: %s\n"
+ % (
+ parsed_message["Subject"],
+ parsed_message["From"],
+ parsed_message["Date"],
+ )
+ )
+ )
+ print("Message skipped.")
+ return None
+
+
+def get_mailbox_list(server: imaplib.IMAP4, directory: str = '""', pattern: str = '"*"') -> List[str]:
+ """
+ Return a list of usable mailbox names which match the pattern.
+ """
+ resp = []
+ for mb in check_response(server.list(directory, pattern)):
+ if mb is None:
+ continue
+ bits = parse_list_response(mb)
+ if rb"\\Noselect" not in bits[0]:
+ resp.append(bits[2].decode())
+ return resp
+
+
+def get_matching_msgnums(server: imaplib.IMAP4, query: str, sent_before: Optional[str]) -> List[int]:
+ """
+ Return a list of ids of deleted messages in the folder.
+ """
+ resp = []
+ if (sent_before is not None):
+ query = query + " SENTBEFORE " + sent_before
+ print("Getting matching messages sent before " + sent_before)
+ deleted_info = check_response(server.search(None, query))
+ if deleted_info and deleted_info[0]:
+ # If neither None nor empty nor [None], then
+ # the first item should be a list of msg ids
+ resp = [int(n) for n in deleted_info[0].split()]
+ return resp
+
+def get_deleted_msgnums(server: imaplib.IMAP4, sent_before: Optional[str]) -> List[int]:
+ """
+ Return a list of ids of deleted messages in the folder.
+ """
+ return get_matching_msgnums(server, "DELETED", sent_before)
+
+def get_undeleted_msgnums(server: imaplib.IMAP4, sent_before: Optional[str]) -> List[int]:
+ """
+ Return a list of ids of non-deleted messages in the folder.
+ """
+ return get_matching_msgnums(server, "UNDELETED", sent_before)
+
+def get_tagged_msgnums(server: imaplib.IMAP4, sent_before: Optional[str]) -> List[int]:
+ """
+ Return a list of ids of tagged messages in the folder.
+ """
+ return get_matching_msgnums(server, "KEYWORD %s" % TAG_NAME, sent_before)
+
+def mark_messages_deleted(server: imaplib.IMAP4, msgs_to_delete: List[int], only_tag: bool):
+ message_ids = ",".join(map(str, msgs_to_delete))
+ action = TAG_NAME if only_tag else r"(\Deleted)"
+ check_response(
+ server.store(message_ids, "+FLAGS", action)
+ )
+
+def get_msg_headers(server: imaplib.IMAP4, msg_ids: List[int]) -> List[Tuple[int, bytes]]:
+ """
+ Get the dict of headers for each message in the list of provided IDs.
+ Return a list of tuples: [ (msgid, header_bytes), (msgid, header_bytes)... ]
+ The returned header_bytes can be parsed by
+ """
+ # Get the header info for each message
+ message_ids_str = ",".join(map(str, msg_ids))
+ ms = check_response(server.fetch(message_ids_str, "(RFC822.HEADER)"))
+
+ # There are two lines per message in the response
+ resp: List[Tuple[int, bytes]] = []
+ for ci in range(len(ms) // 2):
+ mnum = int(msg_ids[ci])
+ _, hinfo = ms[ci * 2]
+ resp.append((mnum, hinfo))
+ return resp
+
+
+def print_message_info(parsed_message: Message):
+ print("From: " + str_header(parsed_message, "From"))
+ print("To: " + str_header(parsed_message, "To"))
+ print("Cc: " + str_header(parsed_message, "Cc"))
+ print("Bcc: " + str_header(parsed_message, "Bcc"))
+ print("Subject: " + str_header(parsed_message, "Subject"))
+ print("Date: " + str_header(parsed_message, "Date"))
+ print("")
+
+
+def add_quotes(mbox: str) -> str:
+ if " " in mbox and (mbox[0] != '"' or mbox[-1] != '"'):
+ mbox = '"' + mbox + '"'
+ return mbox
+
+
+# This actually does the work
+def process(options, mboxes: List[str]):
+ serverclass: Type[Any]
+ if options.process:
+ serverclass = imaplib.IMAP4_stream
+ elif options.ssl:
+ serverclass = imaplib.IMAP4_SSL
+ else:
+ serverclass = imaplib.IMAP4
+
+ try:
+ if options.process:
+ server = serverclass(options.process)
+ elif options.port:
+ server = serverclass(options.server, options.port)
+ else:
+ # Use the default, which will be different depending on SSL choice
+ server = serverclass(options.server)
+ except socket.error as e:
+ sys.stderr.write(
+ "\nFailed to connect to server. Might be host, port or SSL settings?\n"
+ )
+ sys.stderr.write("%s\n\n" % e)
+ sys.exit(1)
+
+ # server.debug = 4 # If you want to see what's going on
+
+ if ("STARTTLS" in server.capabilities) and hasattr(server, "starttls"):
+ server.starttls()
+ elif options.starttls:
+ sys.stderr.write("\nError: Server did not offer TLS\n")
+ sys.exit(1)
+ elif not options.ssl:
+ sys.stderr.write("\nWarning: Unencrypted connection\n")
+
+ try:
+ if not options.process:
+ server.login(options.user, options.password)
+ except:
+ sys.stderr.write("\nError: Login failed\n")
+ sys.exit(1)
+
+ # List mailboxes option
+ # Just do that and then exit
+ if options.just_list:
+ for mb in get_mailbox_list(server):
+ print(mb)
+ return
+
+ if len(mboxes) == 0:
+ sys.stderr.write("\nError: Must specify mailbox\n")
+ sys.exit(1)
+
+ # Recursive option
+ # Add child mailboxes to mboxes
+ if options.recursive:
+ # Make sure mailbox name is surrounded by quotes if it contains a space
+ parent = add_quotes(mboxes[0])
+ # Fetch the hierarchy delimiter
+ bits = parse_list_response(check_response(server.list(parent, '""'))[0])
+ delimiter = bits[1].decode()
+ pattern='"' + delimiter + '*"'
+ for mb in get_mailbox_list(server, parent, pattern):
+ mboxes.append(mb)
+ print("Working recursively from mailbox %s. There are %d total mailboxes." % (parent, len(mboxes)))
+
+ if options.reverse:
+ mboxes.reverse()
+
+ if len(mboxes) > 1:
+ print("Working with mailboxes in order: %s" % (", ".join(mboxes)))
+
+ # OK - let's get started.
+ # Iterate through a set of named mailboxes and delete the later messages discovered.
+ try:
+ parser = BytesParser() # can be the same for all mailboxes
+ # Create a list of previously seen message IDs, in any mailbox
+ msg_ids: Dict[str, str] = {}
+ for mbox in mboxes:
+ msgs_to_delete = [] # should be reset for each mbox
+ msg_map = {} # should be reset for each mbox
+
+ # Make sure mailbox name is surrounded by quotes if it contains a space
+ mbox = add_quotes(mbox)
+
+ # Select the mailbox
+ msgs = check_response(server.select(mailbox=mbox, readonly=options.dry_run))[0]
+ print("There are %d messages in %s." % (int(msgs), mbox))
+
+ # Check how many messages are already marked 'deleted'...
+ numdeleted = len(get_deleted_msgnums(server, options.sent_before))
+ print(
+ "%s message(s) currently marked as deleted in %s"
+ % (numdeleted or "No", mbox)
+ )
+
+ # Now get a list of the ones that aren't deleted.
+ # That's what we'll actually use.
+ msgnums = get_undeleted_msgnums(server, options.sent_before)
+ print("%s others in %s" % (len(msgnums), mbox))
+
+ chunkSize = 100
+ if options.verbose:
+ print("Reading the others... (in batches of %d)" % chunkSize)
+
+ for i in range(0, len(msgnums), chunkSize):
+ if options.verbose:
+ print("Batch starting at item %d" % i)
+
+ # and parse them.
+ for mnum, hinfo in get_msg_headers(server, msgnums[i: i + chunkSize]):
+ # Parse the header info into a Message object
+ mp = parser.parsebytes(hinfo)
+
+ if options.verbose:
+ print("Checking %s message %s" % (mbox, mnum))
+ # Store message only when verbose is enabled (to print it later on)
+ msg_map[mnum] = mp
+
+ # Record the message-ID header (or generate one from other headers)
+ msg_id = get_message_id(
+ mp, options.use_checksum, options.use_id_in_checksum
+ )
+
+ if msg_id:
+ # If we've seen this message before, record it as one to be
+ # deleted in this mailbox.
+ if msg_id in msg_ids:
+ print(
+ "Message %s_%s is a duplicate of %s and %s be %s"
+ % (
+ mbox, mnum, msg_ids[msg_id],
+ options.dry_run and "would" or "will",
+ "tagged as '%s'" % TAG_NAME if options.only_tag else "marked as deleted",
+ )
+ )
+ if options.show or options.verbose:
+ print(
+ "Subject: %s\nFrom: %s\nDate: %s\n"
+ % (mp["Subject"], mp["From"], mp["Date"])
+ )
+ msgs_to_delete.append(mnum)
+ # Otherwise just record the fact that we've seen it
+ else:
+ msg_ids[msg_id] = f"{mbox}_{mnum}"
+
+ print(
+ (
+ "%s message(s) in %s processed"
+ % (min(len(msgnums), i + chunkSize), mbox)
+ )
+ )
+
+ # OK - we've been through this mailbox, and msgs_to_delete holds
+ # a list of the duplicates we've found.
+
+ if len(msgs_to_delete) == 0:
+ print("No duplicates were found in %s" % mbox)
+
+ else:
+ if options.verbose:
+ print("These are the duplicate messages: ")
+ for mnum in msgs_to_delete:
+ print_message_info(msg_map[mnum])
+
+ if options.dry_run:
+ print(
+ "If you had NOT selected the 'dry-run' option,\n"
+ " %i messages would now be %s."
+ % (
+ len(msgs_to_delete),
+ "tagged as '%s'" % TAG_NAME if options.only_tag else "marked as deleted",
+ )
+ )
+
+ else:
+ if options.only_tag:
+ print("Tagging %i messages as '%s'..." % (len(msgs_to_delete), TAG_NAME))
+ else:
+ print("Marking %i messages as deleted..." % (len(msgs_to_delete)))
+ # Deleting messages one at a time can be slow if there are many,
+ # so we batch them up.
+ chunkSize = 30
+ if options.verbose:
+ print("(in batches of %d)" % chunkSize)
+ for i in range(0, len(msgs_to_delete), chunkSize):
+ mark_messages_deleted(server, msgs_to_delete[i: i + chunkSize], options.only_tag)
+ if options.verbose:
+ print("Batch starting at item %d marked." % i)
+ print("Confirming new numbers...")
+ numdeleted = len(get_deleted_msgnums(server, options.sent_before))
+ numundel = len(get_undeleted_msgnums(server, options.sent_before))
+ print(
+ "There are now %s messages marked as deleted and %s others in %s."
+ % (numdeleted, numundel, mbox)
+ )
+ if options.only_tag:
+ numtagged = len(get_tagged_msgnums(server, options.sent_before))
+ print(
+ "There are now %s messages tagged as '%s' in %s."
+ % (numtagged, TAG_NAME, mbox)
+ )
+
+ if not options.no_close:
+ server.close()
+
+ except ImapDedupException as e:
+ print("Error:", e, file=sys.stderr)
+ finally:
+ server.logout()
+
+if __name__ == "__main__":
+ options, mboxes = get_arguments()
+ process(options, mboxes)