#!/usr/bin/perl # # baywatch # # A simple Bayesian spam filter # by Michael J. Fromberger # # To test a message, pipe to `baywatch --test --return' # To get command-line help, run `baywatch --help' # # $Id: baywatch,v 1.8 2002/09/11 21:14:43 sting Exp $ # use Getopt::Long; use DB_File; use Fcntl; @_=split(/\//,$0);chomp($prog=pop(@_)); # Default probability that a message containing this word is spam $default_spam_prob = 0.4; # Where to look for the user's word database (%h == home dir) $word_db_path = '%h/.worddb'; # How many interesting words we'll bother considering $interest_threshold = 15; # How far a probability estimate has to be from 0.5 to be considered $interest_band = 0.09; # Probability inset (to prevent division by zero) $spam_epsilon = 0.01; # Default probability cutoff for messages to be classified as spam $spam_threshold = 0.9; # How many characters a word must have at minimum to be interesting $word_min_chars = 3; # How many characters at most a word must have to be interesting $word_max_chars = 25; # Words with a high crap ratio are ignored from the input $word_crap_level = 0.7; # -------------------------------------------------------------------- # You should not need to edit anything below this line # Operator precedence table for filter expressions # Don't stare too hard at this if you want to remain sane. $prec_tab = 'e()&|$ ??>>>> <<=<>>> <<>>>> <<><>> < 0) { $spam_threshold = $opt{'return'}; } if(exists $opt{'filter'}) { $filter_expr = parse_filter_expr($opt{'filter'}, $prec_tab); if(ref($filter_expr) ne "ARRAY") { printf STDERR ("%s in filter expression.\n", $filter_expr); exit(1); } elsif($opt{'display'} || $opt{'words'}) { print STDERR "Filter expression: "; print_filter_expr($filter_expr); print STDERR "\n"; } } # Rule out mutually exclusive options if(exists($opt{'test'}) && (exists($opt{'words'}) || exists($opt{'good'}) || exists($opt{'spam'}))) { print STDERR ("The `test' and `words/good/spam' options are mutually ", "exclusive.\n"); exit(1); } elsif(exists($opt{'words'}) && (exists($opt{'test'}) || exists($opt{'good'}) || exists($opt{'spam'}))) { print STDERR ("The `words' and `test/good/spam' options are ", "mutually exclusive.\n"); exit(1); } # Read in existing database of words if($opt{'test'}) { $word_db = read_word_table($word_db_path, "r"); } else { $word_db = read_word_table($word_db_path, "rw"); } # Get message counts, or create them. The spaces in these names are # there on purpose; no "real" words will contain spaces so these are # safe meta-keys. unless(exists($word_db->{"# good"})) { $word_db->{"# good"} = 0; } $num_good_msgs = $word_db->{"# good"}; unless(exists($word_db->{"# bad"})) { $word_db->{"# bad"} = 0; } $num_spam_msgs = $word_db->{"# bad"}; $num_total_msgs = $num_good_msgs + $num_spam_msgs; # If requested, flush all old entries. Doing so doesn't end the # program, so we'll continue with whatever else the user asked for # after doing this. if($opt{'flush'} || $opt{'dbimport'}) { printf STDERR ("Flushing all entries from `%s'\n", $word_db_path); foreach $word (keys(%$word_db)) { delete($word_db->{$word}); } } if($opt{'dbimport'}) { my $path = $opt{'dbimport'}; my $num_words; printf STDERR ("Importing database from %s\n", ($path eq "-" ? "standard input" : "`$path'")); open(IFP, $path) or die "$prog: unable to open import file $path: $!\n"; while() { chomp; my($key, $val) = split(/\t/, $_); next unless($key && $val); $word_db->{$key} = $val; ++$num_words; } close(IFP); printf STDERR (" -- ok, imported %d records\n", $num_words); } # If the user asked for word statistics, give it to them, and bail. if($opt{'words'}) { my(@words); printf STDERR ("%d words in database\n%d messages: %d good, %d bad\n", scalar(keys(%$word_db)), $num_total_msgs, $num_good_msgs, $num_spam_msgs); if($opt{'delete'}) { print STDERR ("Selected words will be deleted.\n"); } printf STDERR ("%24s %5s %5s %5s %5s %5s %5s %5s %5s %5s\n", "WORD", "FREQ", "GOOD", "BAD", "AVG", "G-AVE", "B-AVG", "CRAP", "RPROB", "SPROB"); if(@ARGV == 0) { @words = sort {$a cmp $b} keys(%$word_db); } else { @words = @ARGV; } foreach $word (map lc, @words) { next if($word =~ / /o); my $info = get_word_info($word_db, $word); # If specified, only include words in the filter spec if($filter_expr) { next unless match_filter_expr($info, $filter_expr); } printf("%24s %5d %5d %5d %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n", $word, $info->{"freq"}, $info->{"good"}, $info->{"bad"}, $info->{"average"}, $info->{"good_average"}, $info->{"bad_average"}, $info->{"crap"}, $info->{"rprob"}, $info->{"sprob"}); if($opt{'delete'}) { delete($word_db->{$word}); } } exit(0); } # Here, we test a message for spammitude if($opt{'test'}) { my $headers; my $interest = $interest_threshold; if($opt{'display'}) { printf STDERR ("Interest threshold: %d\n" . "Interest bandwidth: %.3f\n" . "Probability inset: %.3f\n" . "Frequency scaling: %s\n" . "Spam threshold: %.3f\n" . "Novelty bias: %.3f\n\n", $interest_threshold, $interest_band, $spam_epsilon, ($opt{'normalize'} ? "yes" : "no"), $spam_threshold, $default_spam_prob); } # Unless requested to do plain text, throw away mail headers unless($opt{'plain'}) { while(<>) { chomp; last if /^$/o; $headers .= $_ . "\n"; } } # Suck up all the rest of the available lines @lines = <>; # Compute word statistics $info = parse_text_message(@lines); # Look up probabilities in the word database foreach $word (keys(%$info)) { my $sp = spam_prob($word_db, $word, $opt{'normalize'}); if(abs($sp - 0.5) > $interest_band) { $info->{$word} = $sp; } else { delete($info->{$word}); } } # See `bayes-spam.pdf' for a description of how this works. $p1 = 1.0; $p2 = 1.0; foreach $word (sort {abs($info->{$b} - 0.5) <=> abs($info->{$a} - 0.5)} keys(%$info)) { my $sp = $info->{$word} + 0; if($opt{'display'}) { printf STDERR ("%24s %.4f\n", $word, $sp); } $p1 *= $sp; $p2 *= (1.0 - $sp); last if --$interest == 0; } $prob = $p1 / ($p1 + $p2); # If they asked us to return an exit status, return 0 if the # message is okay, 1 if it's spam. Otherwise, print out the # probability we computed and all's well. if($opt{'log'}) { printf STDERR ("%s [%d/%d] SP=%0.3f ST=%0.3f AV=%s IT=%d EP=%0.2f\n" . "\tFrom: %s\n", $prog, $$, $<, $prob, $spam_threshold, ($opt{'normalize'} ? "yes" : "no"), $interest_threshold, $spam_epsilon, ($headers =~ /^From:.*?([-\w.]+@[-a-z0-9.]+)/im ? $1 : "??")); } if(exists($opt{'return'})) { my $rtn = ($prob > $spam_threshold) ? 1 : 0; if($opt{'display'}) { printf STDERR ("Input is %s (%d/%.3f)\n", ($rtn ? "spam" : "good"), $rtn, $prob); } exit($rtn); } printf("%0.6f\n", $prob); exit(0); } # So, if we got here, it's likely that the user asked to import some # messages. Or maybe to forget or reclassify. # # This is a nice tangly little web of conditionals, all different, but # efficiency is not really a priority down this branch, since this is # not getting run from procmail or anything like that. Hopefully. # The default action is "learn", which means to soak up all the words # you find and add them to the chosen side. "Unlearn" means you want # to take all these words OUT of the chosen side. "Exchange" means # you want to move them from one side (the side specified on the # command line) to the other. $act = "learn"; if($opt{'unlearn'}) { $act = "unlearn"; } elsif($opt{'exchange'}) { $act = "switch"; } # Okay, do any pending `good' messages... if(exists($opt{'good'})) { foreach $file (@{$opt{'good'}}) { printf STDERR ("%sing good messages from `%s'\n", ucfirst($act), $file); if(read_mail_file($file, $word_db, "good", $act)) { print STDERR (" -- ok, messages processed\n"); } else { printf STDERR (" ** error processing messages: %s\n", $!); exit(1); } } } # And, any pending `bad' messages... if(exists($opt{'spam'})) { foreach $file (@{$opt{'spam'}}) { printf STDERR ("%sing spam messages from `%s'\n", ucfirst($act), $file); if(read_mail_file($file, $word_db, "bad", $act)) { print STDERR (" -- ok, messages processed\n"); } else { printf STDERR (" ** error processing messages: %s\n", $!); exit(1); } } } # If requested, export the database to a flat text file if($opt{'dbexport'}) { my $path = $opt{'dbexport'} || "-"; my($k, $v); open(OFP, ">$path") or die "$prog: unable to open export file $path: $!\n"; printf STDERR ("Exporting database to %s\n", ($path eq "-" ? "standard output" : "`$path'")); printf OFP ("# good\t%d\n# bad\t%d\n", $word_db->{"# good"}, $word_db->{"# bad"}); while(($k, $v) = each(%$word_db)) { next if($k eq "# good" || $k eq "# bad"); printf OFP ("%s\t%s\n", $k, $v); } if(close(OFP)) { print STDERR (" -- ok, database exported\n"); } else { print STDERR (" ** error exporting database: $!\n"); } } # This really doesn't need to be a function; it's a holdover from the # early versions which used a flat text file as the database. write_word_table($word_db, $word_db_path); exit(0); # {{{ read_mail_file(path, table, kind) # Here's where we soak up new messages. We assume we're reading a # Unix mailbox file, with `From ' lines to delimit messages. The # headers are ignored, except the Subject. sub read_mail_file { my($path, $table, $kind, $act) = @_; my(@lines, $line, $num_msgs); open(IFP, $path) or return 0; for(;;) { @lines = (); while(defined($line = )) { chomp; last if $line =~ /^$/o; if($line =~ /^Subject:\s*(.+)$/) { push(@lines, $1); } } # Unescape any escaped `From ' lines while we're at it while(defined($line = ) && $line !~ /^From /o) { chomp; $line =~ s/^>From /From /; push(@lines, $line); } ++$num_msgs; my $w = parse_text_message(@lines); my $word; foreach $word (keys(%$w)) { my($good, $bad); if(exists($table->{$word})) { ($good, $bad) = split(/:/, $table->{$word}); } # If we're unlearning or reclassifying, subtract the count # from the inbound side if($act eq "unlearn" || $act eq "switch") { if($kind eq "good") { $good -= $w->{$word}; $good = 0 if $good < 0; } else { $bad -= $w->{$word}; $bad = 0 if $bad < 0; } } # If we're learning, add the count to the inbound side; if # we're reclassifying, add it to the outbound side. if($act eq "learn") { if($kind eq "good") { $good += $w->{$word}; } else { $bad += $w->{$word}; } } elsif($act eq "switch") { if($kind eq "good") { $bad += $w->{$word}; } else { $good += $w->{$word}; } } if($good || $bad) { $w->{$word} = sprintf("%d:%d", $good, $bad); } else { delete($w->{$word}); delete($table->{$word}); } } foreach $word (keys(%$w)) { $table->{$word} = $w->{$word}; } last unless defined($line); } # If we are either unlearning or reclassifying, subtract the # messages just unlearned from the appropriate side. If we are # reclassifying, add them to the other side. if($act eq "unlearn" || $act eq "switch") { my($new_count) = $table->{"# $kind"} - $num_msgs; if($new_count < 0) { $new_count = 0; } $table->{"# $kind"} = $new_count; if($act eq "switch") { my($other_kind) = ($kind eq "good") ? "bad" : "good"; $table->{"# $other_kind"} += $num_msgs; } } else { $table->{"# $kind"} += $num_msgs; } close(IFP); } # }}} # {{{ parse_text_message(lines) # Here's where we decide what constitutes a word. # Basically, it's any run of letters, digits, apostrophes, dollar # signs, and asterisks at least 3 characters long and no more than 25 # characters. The upper limit keeps you from sucking in a lot of # base64 encoded MIME attachments. We could be smarter about parsing # those MIME headers, but that's just silly. # # Lowercase everything, and return a hash mapping words to frequencies. sub parse_text_message { my(@lines) = @_; my($words, $line); $words = {}; foreach $line (@lines) { $line =~ s/^[\s>]+//; my(@words) = map lc, split(/[^A-Za-z0-9\'\$\*]+/, $line); foreach (@words) { next if(length($_) > $word_max_chars || length($_) < $word_min_chars || /^[0-9]/o); next unless(/[a-z]/o); next if word_is_crap($_); $words->{$_} = 0 unless exists($words->{$_}); $words->{$_} += 1; } } $words; } # }}} # {{{ word_is_crap(word) # A word is "crap" if the ratio of different characters in the word to # its length is too small. This is typical of "words" resulting from # the reading of base64 encoded attachments. sub word_crap_index { my($word) = shift; my(%cmap); map {$cmap{$_}=1} split(//, $word); 1 - (scalar(keys(%cmap)) / length($word)); } sub word_is_crap { word_crap_index(@_) > $word_crap_level; } # }}} # {{{ read_word_table(path) sub read_word_table { my($path, $mode) = @_; my $out = {}; my $rmode = ($mode =~ /w/i ? (O_RDWR|O_CREAT) : O_RDONLY); tie(%$out, 'DB_File', $path, $rmode, 0640) or do { printf STDERR ("%s [%d/%d] was unable to open %s: %s\n", $prog, $$, $<, $path, $!); exit(0); # Exit successfully }; return $out; } # }}} # {{{ write_word_table(table, path) sub write_word_table { my($table, $path) = @_; my($key); if(tied(%$table)) { return untie(%$table); } } # }}} # {{{ spam_prob(table, word) sub spam_prob { my($table, $word, $scale) = @_; if(exists($table->{$word})) { my($good, $bad) = split(/:/, $table->{$word}); if($bad == 0) { return (0 + $spam_epsilon); } elsif($good == 0) { return (1 - $spam_epsilon); } elsif($scale) { my($bbad) = ($bad / $num_spam_msgs); return $bbad / (($good / $num_good_msgs) + $bbad); } else { return $bad / ($good + $bad); } } else { return $default_spam_prob; } } # }}} # {{{ read_prec_table(tabstr) sub read_prec_table { my(@tab) = split(/\s+/, shift); my(@syms) = split(/\s*/, shift(@tab)); my(%out, $sym); foreach (@syms) { $out{$_} = {}; } foreach $sym (@syms) { my(@rel) = split(//, shift(@tab)); my $pos; for($pos = 0; $pos < @rel && $pos < @syms; ++$pos) { $out{$sym}->{$syms[$pos]} = $rel[$pos]; } } %out; } # }}} # {{{ parse_filter_expr(expr, prec_tab) sub parse_filter_expr { my($expr, $prec_tab) = @_; my(@args) = ( "\$" ); my($cur, $ctype, @ops); my %optab = read_prec_table($prec_tab); $ctype = "\$" if $expr eq ""; for(;;) { my $top = $args[$#args]; my $ttype = (ref($top) eq "ARRAY" ? "e" : $top); last if($ctype eq "\$" && $ttype eq "\$"); unless($cur) { if($expr =~ /^(f[sg]?|len)\s*(!?=|<=?|>=?)\s*(\d+)(.*)$/i) { $cur = [ lc($1), $2, 0 + $3 ]; $ctype = "e"; $expr = $4; } elsif($expr =~ /^(f[sg]?)\s*(!?=|<=?|>=?)\s*(f[sg]?|len)(.*)$/i) { $cur = [ lc($1), $2, lc($3) ]; $ctype = "e"; $expr = $4; } elsif($expr =~ /^(v[sg]?)\s*(!?=|<=?|>=?)\s*(\d+(\.\d+)?)(.*)$/i) { $cur = [ lc($1), $2, 0 + $3 ]; $ctype = "e"; $expr = $5; } elsif($expr =~ /^(v[sg]?)\s*(!?=|<=?|>=?)\s*(v[sg]?|c|p[sn]?)(.*)$/i) { $cur = [ lc($1), $2, lc($3) ]; $ctype = "e"; $expr = $4; } elsif($expr =~ /^(c|p[sn]?)\s*(!?=|<=?|>=?)\s*(0|0?\.\d+)(.*)$/i) { $cur = [ lc($1), $2, 0 + $3 ]; $ctype = "e"; $expr = $4; } elsif($expr =~ /^(c|p[sn]?)\s*(!?=|<=?|>=?)\s*(v[sg]?|c|p[sn]?)(.*)$/i) { $cur = [ lc($1), $2, lc($3) ]; $ctype = "e"; $expr = $4; } elsif($expr =~ /^(key|pfx|sfx)\s*(=)\s*([a-z\'\$\*]+)(.*)$/i) { $cur = [ lc($1), $2, $3 ]; $ctype = "e"; $expr = $4; } elsif($expr =~ /^([\&\|\(\)])(.*)$/) { $cur = $ctype = $1; $expr = $2; } elsif($expr =~ /^\s+(.*)$/) { $expr = $1; next; # Skip whitespace } elsif($expr eq "") { $cur = $ctype = "\$"; } else { return "Invalid input"; # Lexical error } } my $rel = $optab{$ttype}->{$ctype}; if($rel eq "<" || $rel eq "=") { push(@args, $cur); $cur = ""; } elsif($rel eq ">") { my($pop, $ptype); do { $pop = $top; pop(@args); $ptype = $ttype; $top = $args[$#args]; $ttype = (ref($top) eq "ARRAY" ? "e" : $top); if($ptype eq "e") { push(@ops, $pop); } elsif($ptype eq "&" || $ptype eq "|") { my $t2 = pop(@ops); my $t1 = pop(@ops); unless($t1 && $t2) { return "Missing operand"; } push(@ops, [ $ptype, $t1, $t2 ]); } } while(@args != 0 && $optab{$ttype}->{$ptype} ne "<"); } else { return "Syntax error"; } } if($ctype ne "\$") { return "Unexpected junk"; } return $ops[0] || "Nothing"; } # }}} # {{{ match_filter_expr(word, expr) sub match_filter_expr { my($word, $expr) = @_; my($key) = $expr->[0]; if($key eq "&") { match_filter_expr($word, $expr->[1]) && match_filter_expr($word, $expr->[2]); } elsif($key eq "|") { match_filter_expr($word, $expr->[1]) || match_filter_expr($word, $expr->[2]); } else { match_simple_expr($word, $expr); } } # }}} # {{{ match_simple_expr(word, expr) sub BEGIN { # Property keys used for filter matching %keymap = ( "p" => "rprob", "ps" => "rprob", "pn" => "sprob", "f" => "freq", "fs" => "bad", "fg" => "good", "v" => "average", "vs" => "bad_average", "vg" => "good_average", "c" => "crap", ); sub match_simple_expr { my($info, $expr) = @_; my($key, $cmp, $rhs) = @$expr; if(exists($keymap{$key})) { # If the RHS is the name of another word value, grab that # value; otherwise, it should be a plain number. if(exists($keymap{$rhs})) { $rhs = $info->{$keymap{$rhs}}; } return match_comparison($info->{$keymap{$key}}, $cmp, $rhs); } elsif($key eq "len") { return match_comparison(length($info->{"key"}), $cmp, $rhs); } elsif($key eq "key") { return $key eq $info->{"key"}; } elsif($key eq "pfx") { return $info->{"key"} =~ /^$rhs/; } elsif($key eq "sfx") { return $info->{"key"} =~ /$rhs$/; } else { return 1; } 0; } } # }}} # {{{ match_comparison(lhs, cmp, rhs) sub match_comparison { my($lhs, $cmp, $rhs) = @_; return (($cmp eq "=" && $lhs == $rhs) || ($cmp eq "<" && $lhs < $rhs) || ($cmp eq ">" && $lhs > $rhs) || ($cmp eq "<=" && $lhs <= $rhs) || ($cmp eq ">=" && $lhs >= $rhs) || ($cmp eq "!=" && $lhs != $rhs)); } # }}} # {{{ get_word_info($table, $word) sub get_word_info { my($table, $word) = @_; my($good, $bad); my $out = { "key" => $word }; if(exists($table->{$word})) { ($good, $bad) = split(/:/, $table->{$word}); } $out->{"freq"} = $good + $bad; $out->{"good"} = $good; $out->{"bad"} = $bad; $out->{"average"} = (($good + $bad) / ($num_total_msgs || 1)); $out->{"good_average"} = ($good / ($num_good_msgs || 1)); $out->{"bad_average"} = ($bad / ($num_spam_msgs || 1)); $out->{"rprob"} = spam_prob($table, $word, 0); $out->{"sprob"} = spam_prob($table, $word, 1); $out->{"crap"} = word_crap_index($word); $out } # }}} # {{{ print_filter_expr(expr) sub print_filter_expr { my($expr) = shift; if($expr->[0] eq "&" || $expr->[0] eq "|") { print STDERR "("; print_filter_expr($expr->[1]); print STDERR " ", $expr->[0], " "; print_filter_expr($expr->[2]); print STDERR ")"; } else { printf STDERR ("%s %s %s", @$expr); } } # }}} # HERE THERE BE DRAGONS