#! /usr/local/bin/python # -*- coding: utf-8 -*- # # Py-MailListStat 1.0 # Program for generating various statistics on emails # # This program is based on/is a reimplementation of # MailListStat by Marek Podmaka (http://www.marki-online.net/MLS/) # ############################################################################### # # Copyright (c) 2010, Martin Schuette # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS # ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED # TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS # BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. 
#
###############################################################################
#
# TODOs:
#  - add number of missing values to every output
#  - write the report to a file (all output is funnelled through _out())
#  - count quote ratios
#

import sys
import mailbox
import email
import email.errors
import email.header
import email.utils
import time
import operator
import re
from optparse import OptionParser

# The script runs on Python 2 and Python 3; only the name of the text type
# differs between the two.
try:
    _text_type = unicode  # Python 2
except NameError:
    _text_type = str      # Python 3

# variables are global
sender_msg_count = {}
sender_total_size = {}
subject_count = {}
client_count = {}
total_msg_count = 0
total_size = 0
first_date = time.time()
last_date = 0.0
hour_count = 24*[0]
mday_count = 32*[0]  # no day 0
wday_count = 7*[0]   # indexed by tm_wday, i.e. 0 = Monday
mon_count = 13*[0]   # no month 0
year_count = {}      # special case: dictionary

# A run of reply/forward markers; \b keeps words such as "There: " intact.
_PREFIX_RE = re.compile(r'\b(?:Re|Aw|Fwd):\s+', re.IGNORECASE)
_WHITESPACE_RE = re.compile(r'\s+')
# One innermost "(...)" group together with the blanks in front of it.
_PAREN_RE = re.compile(r'\s*\([^()]*\)')
# "name a.b[.c[.d]][ rest]" -> groups: "name a.b", patch levels, " rest"
_VERSION_RE = re.compile(r'(^.*[\s/]\d+\.\d+)(\.\d+\D?)*( .*)?$')

# Cell widths (between the '|' separators) of the three table layouts;
# every layout is 70 characters wide in total.
_COUNT_WIDTHS = (4, 46, 6, 9)
_KBYTES_WIDTHS = (4, 44, 8, 9)
_AVG_WIDTHS = (4, 54, 8)


### small shared helpers

def _out(text=""):
    """Write one line of the report to standard output."""
    sys.stdout.write(text + "\n")


def _header_text(value):
    """Return a header value as a plain string (None stays None).

    On Python 3 the email package may hand out email.header.Header objects,
    which are neither hashable nor sliceable.
    """
    if value is None or isinstance(value, (str, _text_type)):
        return value
    return _text_type(value)


def _printable(text):
    """Return text in a form that can always be written to stdout.

    Characters the output encoding cannot represent are replaced by '?'.
    """
    if not isinstance(text, _text_type):
        return text  # Python 2 byte string, written as-is
    encoding = getattr(sys.stdout, 'encoding', None) or 'ascii'
    encoded = text.encode(encoding, 'replace')
    if isinstance(encoded, str):
        return encoded  # Python 2: the encoded byte string is what we print
    return encoded.decode(encoding)


def _percent(part, whole):
    """Return part as a percentage of whole; 0.0 when whole is zero."""
    return 100.0 * part / whole if whole else 0.0


def _ranked(counts, top):
    """Return the top (key, value) pairs, largest value first, ties by key."""
    ordered = sorted(counts.items(), key=lambda item: (-item[1], item[0]))
    return ordered[:max(top, 0)]


def _rule(widths, labels=()):
    """Return a '+----+' table rule; labels are embedded left-aligned."""
    cells = []
    for idx, width in enumerate(widths):
        label = labels[idx] if idx < len(labels) else ""
        cells.append(label.ljust(width, "-"))
    return "+" + "+".join(cells) + "+"


def _rank_cell(rank):
    """Return the 4 character rank cell; blank for summary rows."""
    if rank is None:
        return "    "
    return " %2d " % rank


### aux parsing functions

def get_client_name(msg):
    """Return the mail client named in the headers of msg, or 'Unknown'.

    The headers User-Agent, X-Mailer and X-Newsreader are tried in that
    order; the first non-empty one wins.
    """
    for header in ('User-Agent', 'X-Mailer', 'X-Newsreader'):
        value = msg[header]
        if value:
            return _header_text(value)
    # NOTE(review): the Message-ID test and its return values were lost when
    # this file was extracted (text between '<' and '>' was dropped).  The
    # Pine check below is a reconstruction -- confirm against upstream.
    message_id = _header_text(msg['Message-ID'])
    if message_id and message_id.startswith('<Pine.'):
        return 'Pine'
    return 'Unknown'


def get_msg_date(msg):
    """Account the Date header of msg in the global date statistics.

    Updates first_date/last_date and the hour, day-of-month, month, weekday
    and year counters.  Messages without a parseable date are skipped.
    """
    global first_date, last_date
    # NOTE(review): the head of this function (how the timestamp is obtained)
    # was lost in extraction; parsing the Date header is a reconstruction.
    raw_date = _header_text(msg['Date'])
    if not raw_date:
        return
    parsed = email.utils.parsedate_tz(raw_date)
    if parsed is None:
        return
    try:
        ts = email.utils.mktime_tz(parsed)
        d = time.localtime(ts)
    except (ValueError, OverflowError, OSError):
        return  # date outside the range the platform can represent

    if ts < first_date:
        first_date = ts
    if ts > last_date:
        last_date = ts
    hour_count[d[3]] += 1
    mday_count[d[2]] += 1
    mon_count[d[1]] += 1
    wday_count[d[6]] += 1
    year_count[d[0]] = year_count.get(d[0], 0) + 1


def _body_size(msg):
    """Return the payload length of msg; for multipart mail of its 1st part."""
    part = msg
    while part.is_multipart():
        children = part.get_payload()
        if not children:
            return 0
        part = children[0]
    payload = part.get_payload()
    return len(payload) if payload else 0


# read mailbox, count
def read_mailbox(filename):
    """Read the mbox file filename and fill the global counters.

    Raises mailbox.NoSuchMailboxError when the file does not exist (the
    mailbox is deliberately not created on the fly).
    """
    global total_msg_count, total_size

    box = mailbox.mbox(filename, create=False)
    try:
        for msg in box:
            # in multipart messages count only 1st part; the same size is
            # used for the grand total and for the per-sender total so that
            # the percentages add up
            size = _body_size(msg)
            total_msg_count += 1
            total_size += size

            sender = email.utils.parseaddr(_header_text(msg['from']) or '')[1]
            sender_msg_count[sender] = sender_msg_count.get(sender, 0) + 1
            sender_total_size[sender] = \
                sender_total_size.get(sender, 0) + size

            subject = _header_text(msg['subject'])
            subject_count[subject] = subject_count.get(subject, 0) + 1

            client = get_client_name(msg)
            client_count[client] = client_count.get(client, 0) + 1

            get_msg_date(msg)
    finally:
        box.close()


### Subjects and Clients are normalized,
### i.e. Re:-prefixes and minor version numbers are stripped (when possible)

def _decode_subject(raw):
    """Decode RFC 2047 encoded words in raw and return one text string."""
    try:
        chunks = email.header.decode_header(raw)
    except email.errors.HeaderParseError:
        return raw
    pieces = []
    for chunk, charset in chunks:
        if isinstance(chunk, _text_type):
            pieces.append(chunk)
            continue
        try:
            pieces.append(chunk.decode(charset or 'ascii', 'replace'))
        except LookupError:  # charset name unknown to Python
            pieces.append(chunk.decode('ascii', 'replace'))
    return " ".join(pieces)


def normalize_subjects(subject_count):
    """Return a new dict with the counts of equivalent subjects merged.

    Encoded words are decoded, whitespace runs (including header folding)
    become single spaces and Re:/Aw:/Fwd: markers are removed.  The markers
    are removed anywhere in the subject, because mailing list software puts
    its list prefix in front of them.  Entries with the key None (messages
    without a subject) are dropped.
    """
    merged = {}
    for raw, count in subject_count.items():
        if raw is None:
            continue
        text = _decode_subject(_header_text(raw))
        text = _WHITESPACE_RE.sub(' ', text)
        text = _PREFIX_RE.sub('', text).strip()
        merged[text] = merged.get(text, 0) + count
    return merged


def _strip_parentheses(name):
    """Remove all parenthesised comments (also nested ones) from name."""
    previous = None
    while previous != name:  # each pass removes the innermost groups
        previous = name
        name = _PAREN_RE.sub('', name)
    dangling = name.find('(')
    if dangling >= 0:  # unterminated comment: drop it up to the end
        name = name[:dangling]
    return name.strip()


def normalize_clients(client_count):
    """Return a new dict with the counts of equivalent clients merged.

    Parenthesised comments are removed and versions a.b.c or a.b.c.d are
    shortened to a.b.
    """
    merged = {}
    for raw, count in client_count.items():
        # fall back to the raw name if nothing but a comment was given
        name = _strip_parentheses(raw) or raw
        found = _VERSION_RE.search(name)
        if found:
            name = found.group(1) + (found.group(3) or '')
        merged[name] = merged.get(name, 0) + count
    return merged


### output functions

def _count_row(rank, entry, num, total):
    """Return one row of a message count table (rank None = summary row)."""
    return "|%s| %-45s| %4d |%6.2f %% |" % (
        _rank_cell(rank), _printable(entry[:45]), num, _percent(num, total))


def _print_count_table(title, column_label, ranked, summary_rows, total):
    """Print a ranking by message count followed by its summary rows."""
    _out(title)
    _out(_rule(_COUNT_WIDTHS, ("", column_label, "--Msg-", "-Percent-")))
    for idx, (entry, num) in enumerate(ranked):
        _out(_count_row(idx + 1, entry, num, total))
    _out(_rule(_COUNT_WIDTHS))
    for entry, num in summary_rows:
        _out(_count_row(None, entry, num, total))
    _out(_rule(_COUNT_WIDTHS))
    _out()
    _out()


def output_popular_clients(client_count, total_msg_count, top=10):
    """Print the top most used mail clients.

    Messages without client information (key 'Unknown') are reported in the
    separate row "not set" and are not part of "other".
    """
    # use a copy to safely delete the Unknown-data
    known = dict(client_count)
    unknowns = known.pop('Unknown', 0)
    ranked = _ranked(known, top)
    listed = sum(map(operator.itemgetter(1), ranked))
    summary = [("other", total_msg_count - listed - unknowns),
               ("not set", unknowns)]
    _print_count_table("***** Most used email clients:", "----Mailer",
                       ranked, summary, total_msg_count)


def output_popular_subjects(subject_count, total_msg_count, top=10):
    """Print the top subjects by number of messages."""
    ranked = _ranked(subject_count, top)
    listed = sum(map(operator.itemgetter(1), ranked))
    _print_count_table("***** Table showing the most successful subjects:",
                       "-----Subject", ranked,
                       [("other", total_msg_count - listed)],
                       total_msg_count)


def output_most_messages(sender_msg_count, total_msg_count, top=10):
    """Print the top authors by number of messages."""
    ranked = _ranked(sender_msg_count, top)
    listed = sum(map(operator.itemgetter(1), ranked))
    _print_count_table("***** People who have written most messages:",
                       "-----Author", ranked,
                       [("other", total_msg_count - listed)],
                       total_msg_count)


def output_most_kbytes(sender_total_size, total_size, top=10):
    """Print the top authors by total size of their messages."""
    def row(rank, entry, size):
        return "|%s| %-42s | %6.1f |%6.2f %% |" % (
            _rank_cell(rank), _printable(entry[:40]), size / 1024.0,
            _percent(size, total_size))

    ranked = _ranked(sender_total_size, top)
    listed = sum(map(operator.itemgetter(1), ranked))

    _out("***** Best authors, by total size of their messages:")
    _out(_rule(_KBYTES_WIDTHS, ("", "-----Author", "-KBytes-", "-Percent-")))
    for idx, (entry, size) in enumerate(ranked):
        _out(row(idx + 1, entry, size))
    _out(_rule(_KBYTES_WIDTHS))
    _out(row(None, "others", total_size - listed))
    _out(_rule(_KBYTES_WIDTHS))
    _out()
    _out()


def output_biggest_avg_messages(sender_total_size, sender_msg_count,
                                total_size, total_msg_count, top=10):
    """Print the top authors by average size of their messages."""
    def row(rank, entry, size):
        return "|%s| %-52s | %6d |" % (
            _rank_cell(rank), _printable(entry[:45]), size)

    avg_size = {}
    for sender, size in sender_total_size.items():
        msgs = sender_msg_count.get(sender, 0)
        if msgs:  # a sender without messages has no average
            avg_size[sender] = 1.0 * size / msgs

    _out("***** Best authors, by average size of their messages:")
    _out(_rule(_AVG_WIDTHS, ("", "-----Author", "--bytes-")))
    for idx, (entry, size) in enumerate(_ranked(avg_size, top)):
        _out(row(idx + 1, entry, size))
    _out(_rule(_AVG_WIDTHS))
    overall = 1.0 * total_size / total_msg_count if total_msg_count else 0.0
    _out(row(None, "average msg size", overall))
    _out(_rule(_AVG_WIDTHS))
    _out()
    _out()


### function to print barcharts for all datetime values
### wrapper functions below have to provide list of values and labels

def _sparse_ticks(count, positions):
    """Return count axis labels that are empty except at positions."""
    ticks = [""] * count
    for pos in positions:
        ticks[pos] = str(pos)
    return ticks


def _default_ticks(count, narrow):
    """Label every 5th column and the last one with its index."""
    ticks = _sparse_ticks(count, range(0, count, 5))
    if count:
        last = count - 1
        ticks[last] = str(last)
        if narrow and last >= 1 and (last - 1) % 5 == 0:
            ticks[last - 1] = ""  # would run into the label of the last column
    return ticks


def _axis_line(axislabel, labels, cell_width):
    """Return the axis line with every label placed below its column.

    A label is right-aligned in its column; a label wider than the column
    simply extends to the right, so it may use the room of following
    columns whose own label is empty.
    """
    chars = list(axislabel.ljust(5))
    for col, label in enumerate(labels):
        text = str(label)
        if not text.strip():
            continue
        text = text.rjust(cell_width)
        start = 5 + col * (cell_width + 1) + 1  # +1: separator blank
        missing = start + len(text) - len(chars)
        if missing > 0:
            chars.extend(" " * missing)
        chars[start:start + len(text)] = text
    return "".join(chars).rstrip()


def output_time(time_count, label="hours of day", axislabel="hour",
                axisticks=None):
    """Print a bar chart of time_count, scaled to its largest value.

    time_count -- sequence of message counts, one per column
    label      -- description used in the heading
    axislabel  -- name of the axis (only its first 5 characters are used)
    axisticks  -- one label per column (strings or numbers, '' for none);
                  by default every 5th column and the last one are labelled
    """
    counts = list(time_count)
    # use wide and narrow column symbols depending on dataset
    narrow = len(counts) >= 20
    if narrow:
        hit, miss, tick = '#', '-', '*'
    else:
        hit, miss, tick = '-#-', '---', ' * '

    if axisticks is None:
        axisticks = _default_ticks(len(counts), narrow)

    cnt_max = max(counts) if counts else 0
    pct = [_percent(count, cnt_max) for count in counts]

    _out("***** Graph showing number of messages written during %s:"
         % (label,))
    _out()
    for level in range(10, 0, -1):
        line = " %2d0%%" % (level,)
        for value in pct:
            line += " " + (hit if value >= level * 10 else miss)
        if level == 10:
            line += "  - %d msgs" % (cnt_max,)
        _out(line)
    _out("     " + "".join(" " + tick for _ in counts))
    _out(_axis_line(axislabel[:5], axisticks, len(tick)))
    _out()


def output_hours(hour_count):
    """Print the chart of messages per hour of day."""
    output_time(hour_count, "hours of day", "hour",
                _sparse_ticks(24, (0, 6, 12, 17, 23)))


def output_mdays(mday_count):
    """Print the chart of messages per day of month (index 0 is unused)."""
    output_time(mday_count, "days of month", "day",
                _sparse_ticks(32, (1, 7, 14, 21, 28)))


def output_years(year_count):
    """Print the chart of messages per year; skipped for a single year."""
    # skip if only one year
    if len(year_count) <= 1:
        return
    years = sorted(year_count)
    axisticks = [str(year)[2:4] for year in years]
    output_time([year_count[year] for year in years], "years", "year",
                axisticks)


def output_months(mon_count):
    """Print the chart of messages per month (index 0 is unused)."""
    axisticks = ['', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    output_time(mon_count, "months of year", "month", axisticks)


def output_wdays(wday_count):
    """Print the chart of messages per day of week.

    wday_count is indexed by tm_wday, which already starts with Monday,
    so the list is used without rotation.
    """
    axisticks = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
    output_time(list(wday_count), "days of week", "wday", axisticks)


def output_summary(total_msg_count, total_size, sender_msg_count,
                   norm_subject_count, first_date, last_date):
    """Print the overall figures of the mailbox."""
    # first_date > last_date is only possible while both still hold their
    # initial values, i.e. when no message carried a usable date
    if first_date > last_date:
        first_text = last_text = "n/a"
    else:
        first_text = time.ctime(first_date)
        last_text = time.ctime(last_date)
    if total_msg_count:
        average = int(1.0 * total_size / total_msg_count)
    else:
        average = 0

    _out("***** Summary:")
    _out("%-22s%s" % ("First message from:", first_text))
    _out("%-22s%s" % ("Last message from:", last_text))
    _out("%-38s%10d" % ("Total number of messages:", total_msg_count))
    _out("%-38s%10d" % ("Total number of different authors:",
                        len(sender_msg_count)))
    _out("%-38s%10d" % ("Total number of different subjects:",
                        len(norm_subject_count)))
    _out("%-38s%10.1f Kbytes" % ("Total size of messages (w/o headers):",
                                 total_size / 1024.0))
    _out("%-38s%10d bytes" % ("Average size of a message:", average))
    _out()
    _out()


def main(argv=None):
    """Parse the command line, read the mailbox and print all statistics.

    Returns the exit status of the program.
    """
    # handle options
    parser = OptionParser(version="%prog 1.0",
                          description="print statistics on email messages")
    parser.add_option("-i", "--input", dest="filename",
                      help="read mbox", metavar="FILE")
    #parser.add_option("-o", "--output", dest="outfile",
    #                  help="write summary to", metavar="FILE")
    parser.add_option("-n", type="int", dest="top",
                      help="print TOP XX authors, subjects and quoting",
                      metavar="XX", default=10)
    (options, args) = parser.parse_args(argv)
    if not options.filename:
        parser.print_help()
        return 2

    # read and normalize
    try:
        read_mailbox(options.filename)
    except (mailbox.Error, IOError, OSError) as err:
        sys.stderr.write("cannot read mbox %r: %s\n"
                         % (options.filename, err))
        return 1
    if not total_msg_count:
        sys.stderr.write("no messages found in %r\n" % (options.filename,))
        return 1
    norm_subject_count = normalize_subjects(subject_count)
    norm_client_count = normalize_clients(client_count)

    # output statistics
    output_summary(total_msg_count, total_size, sender_msg_count,
                   norm_subject_count, first_date, last_date)
    output_most_messages(sender_msg_count, total_msg_count, options.top)
    output_most_kbytes(sender_total_size, total_size, options.top)
    output_biggest_avg_messages(sender_total_size, sender_msg_count,
                                total_size, total_msg_count, options.top)
    output_popular_subjects(norm_subject_count, total_msg_count, options.top)
    output_popular_clients(norm_client_count, total_msg_count, options.top)
    output_hours(hour_count)
    output_mdays(mday_count)
    output_months(mon_count)
    output_wdays(wday_count)
    output_years(year_count)
    _out()
    return 0


if __name__ == '__main__':
    sys.exit(main())