#!/usr/local/bin/python
"""Usage: finddups [--dupfile <file>] [--dupcount <count>] [--noisy] [header ...]

The mail message on standard input is parsed, and an md5 digest of the
headers named in the argument list is computed.  This is then checked
against a list of digests from previously seen messages.

Exit status:

    0   the message is a duplicate (its digest is already on the saved list).
    1   the message is new (its digest has been added to the saved list),
        or finddups could not run (bad option, no digest file location).

The flags are --dupfile (defaults to $HOME/.finddups-digests), --dupcount
(defaults to 500) and --noisy.  dupfile specifies where the digests of
previously seen messages are kept.  dupcount specifies how many digests to
keep.  noisy - if present - will cause finddups to announce whether or not
this is a duplicate message.

All other arguments indicate strings to be added to the digest.  The special
header names "body" and "headers" indicate the text of the body, and the
text of all the headers (received lines included), respectively.  All other
argument names are the names of mail headers to be added to the digest.
"""

import fcntl
import getopt
import hashlib
import os
import string
import sys

try:
    import cPickle as pickle    # Python 2: C-accelerated implementation
except ImportError:
    import pickle

DEFAULT_DUPCOUNT = 500
DEFAULT_DUPFILE = '.finddups-digests'


#
# Argument processing happens here
#
def parse_args(argv):
    """Turn the command line into finddups settings.

    argv is the full argument vector, program name first.

    Returns a (count, filename, verbose, headers) tuple: the number of
    digests to keep, the path of the digest file, whether --noisy was given,
    and the remaining arguments (the names of the message parts to digest).

    Raises getopt.GetoptError for an unknown or malformed option.  Exits
    with status 1 when neither --dupfile nor $HOME supplies a digest file.
    """
    prog = argv[0]
    count = None
    filename = None
    verbose = False

    options, headers = getopt.getopt(argv[1:], '',
                                     ['dupfile=', 'dupcount=', 'noisy'])
    for name, value in options:
        if name == '--noisy':
            verbose = True
        elif name == '--dupfile':
            filename = value
        elif name == '--dupcount':
            try:
                requested = int(value)
            except ValueError:
                requested = 0
            # A zero or negative limit would discard every digest on each
            # run, so it is rejected like any other unusable value.
            if requested > 0:
                count = requested
            else:
                sys.stderr.write('%s: Invalid value for dupcount: %s\n'
                                 % (prog, value))

    count = count or DEFAULT_DUPCOUNT   # The default value is set here.
    if not filename:
        try:
            filename = os.path.join(os.environ['HOME'], DEFAULT_DUPFILE)
        except KeyError:
            sys.stderr.write("%s: You must either set $HOME or provide a "
                             "--dupfile option!\n" % prog)
            sys.exit(1)
    return count, filename, verbose, headers


#
# Now take this message's fingerprint.
#
def message_fingerprint(headers):
    """Return the md5 digest of the message parts named in headers.

    The parts are obtained from mailwrapper.selector(headers).get_headers().
    NOTE(review): mailwrapper is a project module not visible here; per the
    module docstring it reads the message from standard input, and it is
    assumed to yield byte strings that md5 can consume - confirm.
    """
    # Imported here rather than at module level so that the option parsing
    # and digest-file bookkeeping can be used without the mail parser.
    import mailwrapper

    digest = hashlib.md5()
    # An explicit loop: map() is lazy on Python 3 and would hash nothing.
    for chunk in mailwrapper.selector(headers).get_headers():
        digest.update(chunk)
    return digest.digest()


#
# Finally, check (and maybe update) our records.
#
def check_and_record(filename, fingerprint, count=DEFAULT_DUPCOUNT):
    """Report whether fingerprint is already recorded in the digest file.

    Returns True when fingerprint is found in filename; the file is then
    left untouched.  Otherwise the fingerprint is appended, the list is
    trimmed to its newest count entries, the file is rewritten, and False
    is returned.  The file is created if it does not exist, and an
    exclusive flock() is held for the whole read-modify-write cycle.

    Raises OSError/IOError if the file cannot be opened or written.
    """
    # O_CREAT without O_TRUNC: creating the file must never wipe digests
    # that another finddups process stored before we obtained the lock.
    fd = os.open(filename, os.O_RDWR | os.O_CREAT, 0o666)
    with os.fdopen(fd, 'r+b') as store:
        fcntl.flock(store.fileno(), fcntl.LOCK_EX)
        try:
            # NOTE: unpickling runs whatever a tampered file tells it to;
            # the digest file must be writable by its owner only.
            try:
                digests = pickle.load(store)
            except EOFError:
                digests = []            # brand-new, still empty file

            # Time for the moment of truth - is it a duplicate or not?
            if fingerprint in digests:
                return True

            # Guess not...
            digests.append(fingerprint)
            overage = len(digests) - count
            if overage > 0:
                digests = digests[overage:]     # forget the oldest entries
            store.seek(0)
            pickle.dump(digests, store, 0)  # 0 for debugging, 1 for production
            store.truncate()    # drop leftovers of a longer previous pickle
            store.flush()
        finally:
            fcntl.flock(store.fileno(), fcntl.LOCK_UN)
    return False


def main(argv=None):
    """Run finddups and return its exit status (0 duplicate, 1 otherwise)."""
    if argv is None:
        argv = sys.argv
    try:
        count, filename, verbose, headers = parse_args(argv)
    except getopt.GetoptError as err:
        # Non-zero on purpose: a usage error must never look like a
        # duplicate to the caller.
        sys.stderr.write('%s: %s\n' % (argv[0], err))
        return 1

    fingerprint = message_fingerprint(headers)
    if check_and_record(filename, fingerprint, count):
        if verbose:
            sys.stdout.write("Duplicate mail\n")
        return 0
    if verbose:
        sys.stdout.write("Original mail.\n")
    return 1


if __name__ == '__main__':
    sys.exit(main())