#!/bin/sh
#
# Tool to train both spam classifiers:
#  a) SpamAssassin's bayes filter (sa-learn)
#  b) CRM114 (mailtrainer.crm)
# with input mails taken from the two IMAP training folders configured below.
#
# For every message in a training folder the script tries to hard-link the
# copy that is already stored in the CRM114 reaver cache (looked up through
# the X-CRM114-CacheID header). If no cached copy exists, the message text
# is fetched via doveadm into a fresh temp file instead.
#
# Progress output: "." = linked from cache, "o" = fetched from IMAP.
# Exit status: 1 on setup errors, temp-file creation failure, or if the
# same cache file shows up in both training directories.

# source folders (in IMAP)
imap_spam="Spam.train_spam"
imap_good="Spam.train_good"
imap_user="mschuett"

# local account that owns the training data and runs the trainers
username="vscan"

# working folders
crm_dir=/var/amavis/crm114
cache_dir=$crm_dir/reaver_cache/texts
train_dir_good=$crm_dir/train_good
train_dir_spam=$crm_dir/train_spam

# Start from empty training directories: wipe existing ones, create missing ones.
for dir in "$train_dir_good" "$train_dir_spam"; do
  if [ -d "$dir" ]; then
    # ${dir:?} aborts instead of expanding to "/*" should dir ever be empty
    rm -f "${dir:?}"/*
  else
    mkdir "$dir" || exit 1
  fi
done

#######################################
# Populate a training directory with all messages of one IMAP mailbox.
# Globals:   imap_user, cache_dir (read)
# Arguments: $1 - IMAP mailbox name
#            $2 - training directory to fill
# Outputs:   one "." (linked) or "o" (fetched) per message on stdout,
#            error message on stderr
# Returns:   0 normally, 1 if a temp file could not be created
#######################################
link_msgs() {
  lm_mailbox=$1
  lm_target=$2

  doveadm search -u "$imap_user" mailbox "$lm_mailbox" | {
    # each line of "doveadm search" output is read as "<mailbox-guid> <uid>"
    while read -r guid uid; do
      # Reduce the first X-CRM114-CacheID header to the bare cache id.
      # If the header is missing or has an unexpected form, the result is
      # not a valid cache file name and the fetch branch below is taken.
      cacheid=$(doveadm fetch -u "$imap_user" hdr mailbox-guid "$guid" uid "$uid" \
        | grep -F 'X-CRM114-CacheID:' \
        | head -1 \
        | sed -e 's/^X-CRM114-CacheID: sfid-\(2[^ ]*\) *$/\1/')
      file=$cache_dir/$cacheid

      if [ -f "$file" ]; then
        ln -f "$file" "$lm_target" && printf .
      else
        tempfoo=$(mktemp "$lm_target/mail.${uid}.XXXXXXX") || {
          echo "$0: Can't create temp file, exiting..." >&2
          # This usually runs in a pipeline subshell, so "exit" only ends
          # the subshell; the caller checks the status and aborts the script.
          exit 1
        }
        printf o
        # Drop the first and last line of the fetch output
        # (presumably doveadm's field label and record terminator -- TODO confirm).
        doveadm fetch -u "$imap_user" text mailbox-guid "$guid" uid "$uid" \
          | sed -e '1d' -e '$d' > "$tempfoo"
      fi
    done
    # Force status 0 so that only the explicit "exit 1" above signals failure;
    # a failed ln or fetch of the last message must not abort the run.
    :
  }
}

# find files and link into training dirs
printf '%s' "Linking Spam msgs: "
link_msgs "$imap_spam" "$train_dir_spam" || exit 1
echo " "

printf '%s' "Linking Ham msgs: "
link_msgs "$imap_good" "$train_dir_good" || exit 1
echo " "

# ensure no mail is in both training folders
for goodfile in "$train_dir_good"/*; do
  # an empty directory leaves the glob pattern unexpanded; skip it
  [ -e "$goodfile" ] || continue
  cacheid=$(basename "$goodfile")
  if [ -e "$train_dir_spam/$cacheid" ]; then
    echo "Same file in CRM spam-dir and good-dir: $cacheid"
    echo "E-Mail info:"
    grep -E '^(Date|From|To): ' "$goodfile"
    echo
    exit 1
  fi
done
echo linked files.

# the trainers run as $username, so hand the files over first
chown -R "${username}:${username}" "$train_dir_spam" "$train_dir_good"

echo "run sa-learn --spam:"
su "${username}" -c "sa-learn --local --progress --spam $train_dir_spam"
echo "run sa-learn --ham:"
su "${username}" -c "sa-learn --local --progress --ham $train_dir_good"

# CRM114 mailtrainer.crm
cd "$crm_dir" || exit 1
echo "run mailtrainer.crm:"
su "${username}" -c "crm -u $crm_dir mailtrainer.crm --spam=$train_dir_spam/ --good=$train_dir_good/ --repeat=2 --random --worst=3 --thick=10"
echo

echo "clean dirs..."
# -f: an already-empty directory is not an error
rm -f "${train_dir_good:?}"/* "${train_dir_spam:?}"/*