#!/usr/bin/perl -w
#
# Given a spam.log and nonspam.log from a "mass-check --bayes" run,
# work out efficiency.  This variant uses static thresholds that
# model more closely what SpamAssassin uses for scoring.
# 
# usage: bayes-threshold spam.log nonspam.log

my $spam = $ARGV[0] || "spam.log";
my $nonspam = $ARGV[1] || (-f "good.log" ? "good.log" : "nonspam.log");

my $hamcutoff = 0.20;
my $spamcutoff = 0.80;

my $nbuckets = 50;
my $range_lo = 0.00;
my $range_hi = 1.00;
my $step = 0.02;
#my $step = ($range_hi - $range_lo) / $nbuckets;  # unusable - round errors!

$hamcutoff += 0.0;
$spamcutoff += 0.0;

# shamelessly nicked from spambayes' testing infrastructure; a system to
# compute the "cost" of a pair of thresholds and classifier.  I'm supporting
# it here so our stats are (at least a little) comparable against theirs.

my $best_cutoff_fp_weight = 10.0;
my $best_cutoff_fn_weight = 1.0;
my $best_cutoff_unsure_weight = 0.1;

%bux_sp = ();
%bux_ns = ();

my $i;
for ($i = 0; $i <= ($range_hi - $range_lo) / $step; $i++)
{
  my $lvl = $range_lo + ($i * $step);
  push (@buckets, $lvl);
  $bux_ns{$lvl} = $bux_sp{$lvl} = 0;
}

foreach my $file ($spam, $nonspam) {
  open (IN, "<$file") || die "Could not open file '$file': $!";

  my $isspam = 0; ($file eq $spam) and $isspam = 1;

  while (<IN>) {
    /^(\.|Y)\s.+bayes=([^\s,]+)/ or next;
    my $score = $2+0;
    if ($score == 1) { $score = 0.9999999999999; }

    my $bucket_id;
    foreach my $bucket (@buckets) {
      if ($score >= $bucket && $score < $bucket+$step) {
        $bucket_id = $bucket; last;
      }
    }

    if (!defined $bucket_id) {
      warn "no bucket for $score!";
    }
    if (!defined $bux_sp{$bucket_id}) {
      warn "undef bucket at $bucket_id! (score=$bucket)";
    }
    if (!defined $bux_ns{$bucket_id}) {
      warn "undef bucket at $bucket_id! (score=$bucket)";
    }
    if ($isspam) {
      $bux_sp{$bucket_id}++;
    } else {
      $bux_ns{$bucket_id}++;
    }
  }
}

my $max_sp = 0;
my $max_ns = 0;
my $tot_sp = 0;
my $tot_ns = 0;
foreach my $bucket (@buckets) {
  $tot_sp += $bux_sp{$bucket};
  if ($bux_sp{$bucket} > $max_sp) 
                        { $max_sp = $bux_sp{$bucket}; }
  $tot_ns += $bux_ns{$bucket};
  if ($bux_ns{$bucket} > $max_ns) 
                        { $max_ns = $bux_ns{$bucket}; }
}

foreach my $cutoff (0.3, 0.2, 0.1, 0.04, 0.02) {
  $hamcutoff = $cutoff;
  $spamcutoff = 1.0 - $cutoff;

  my %results = results_for_cutoff ($hamcutoff, $spamcutoff);
  write_results (%results);
}

sub results_for_cutoff {
  my %results = ();

  my $fn = 0;
  my $fp = 0;
  my $unsure_sp = 0;
  my $unsure_ns = 0;

  for ($i = $range_lo; $i < $hamcutoff; $i += $step) {
    foreach my $bucket (@buckets) {
      if ($i >= $bucket && $i < $bucket+$step) {
        $fn += $bux_sp{$bucket};
      }
    }
  }
  # total up the unsures (between hamcutoff and spamcutoff)
  for ($i = $hamcutoff; $i <= $spamcutoff; $i += $step) {
    foreach my $bucket (@buckets) {
      if ($i >= $bucket && $i < $bucket+$step) {
        $unsure_ns += $bux_ns{$bucket};
        $unsure_sp += $bux_sp{$bucket};
      }
    }
  }
  for ($i = $spamcutoff; $i <= $range_hi; $i += $step) {
    foreach my $bucket (@buckets) {
      if ($i >= $bucket && $i < $bucket+$step) {
        $fp += $bux_ns{$bucket};
      }
    }
  }

  my $cost = ($fp * $best_cutoff_fp_weight) 
              + ($fn * $best_cutoff_fn_weight)
              + ($unsure_ns * $best_cutoff_unsure_weight)
              + ($unsure_sp * $best_cutoff_unsure_weight);

  $results{"$hamcutoff $spamcutoff"} = {
    'hamcutoff' => $hamcutoff,
    'spamcutoff'=> $spamcutoff,
    'cost' => $cost,
    'unsure_ns' => $unsure_ns,
    'unsure_sp' => $unsure_sp,
    'fp' => $fp,
    'fn' => $fn
  };

  return %results;
}

sub write_results {
  my (%results) = @_;
  foreach my $r (values %results) {
    printf "Threshold optimization for hamcutoff=%3.2f, spamcutoff=%3.2f: cost=\$%5.2f\n",
                  $r->{hamcutoff}, $r->{spamcutoff}, $r->{cost};
    printf "Total ham:spam:   %d:%d\n", $tot_ns, $tot_sp;

    printf "FP: %5d %5.3f%%    ", $r->{fp}, ($r->{fp}*100) / $tot_ns;
    printf "FN: %5d %5.3f%%\n", $r->{fn}, ($r->{fn}*100) / $tot_sp;

    my $unsure = $r->{unsure_ns} + $r->{unsure_sp};
    printf "Unsure: %5d %5.3f%%     ", $unsure,
                                  ($unsure*100) / ($tot_sp+$tot_ns);
    printf "(ham: %5d %5.3f%%    ", $r->{unsure_ns},
                                  ($r->{unsure_ns}*100) / ($tot_ns);
    printf "spam: %5d %5.3f%%)\n", $r->{unsure_sp},
                                  ($r->{unsure_sp}*100) / ($tot_sp);

    # for TCR calc, treat "unsures" as ham
    # TODO: unsure_sp should probably be treated as spam, assuming
    # it'll fall in the 5.0-6.0 score range
    my $fn = $r->{unsure_sp} + $r->{fn};
    my $fp = $r->{fp};
    printf "TCRs:              l=1 %5.3f    l=5 %5.3f    l=9 %5.3f\n",
      tcr ($tot_sp - $fn, $fn, $fp, $tot_ns - $fp, 1),
      tcr ($tot_sp - $fn, $fn, $fp, $tot_ns - $fp, 5),
      tcr ($tot_sp - $fn, $fn, $fp, $tot_ns - $fp, 9);

    printf "SUMMARY: %3.2f/%3.2f  fp %5d fn %5d uh %5d us %5d    c %5.2f\n",
          $r->{hamcutoff}, $r->{spamcutoff},
          $r->{fp}, $r->{fn}, $r->{unsure_ns}, $r->{unsure_sp},
          $r->{cost};

    print "\n";
  }
}


sub tcr {
  my ($nspamspam, $nspamlegit, $nlegitspam, $nlegitlegit, $lambda) = @_;

  my $nlegit = $nlegitspam+$nlegitlegit;
  my $nspam = $nspamspam+$nspamlegit;

  my $werr = ($lambda * $nlegitspam + $nspamlegit)
		  / ($lambda * $nlegit + $nspam);

  my $werr_base = $nspam
		  / ($lambda * $nlegit + $nspam);

  $werr ||= 0.000001;     # avoid / by 0
  my $tcr = $werr_base / $werr;

#my $sr = ($nspamspam / $nspam) * 100.0;
#my $sp = ($nspamspam / ($nspamspam + $nlegitspam)) * 100.0;

  $tcr;
}

