#!/usr/bin/env perl
#
# Test suite for vw:
#
# You may add arbitrary (train/test/varying-options) tests
# by adding data files and their expected reference STDOUT and STDERR
#
# See __DATA__ below for how to add more tests
#
require 5.014;
use warnings;

use Getopt::Std;
use vars qw($opt_d $opt_D $opt_c $opt_e $opt_f
            $opt_E $opt_o $opt_w $opt_y $opt_t
            $opt_v $opt_V);

my $Epsilon = 1e-4;

my $VW;
my $LDA;

# External utilities we use. See init() for Windows specific actions.
my $Diff = 'diff';
my $Cat = 'cat';

$ENV{'PATH'} .= ':test:.:vowpalwabbit:../vowpalwabbit';

# -V prefixes valgrind like this, we should adjust the default
# options over time to what looks most useful.
my $Valgrind = 'valgrind --quiet --error-exitcode=100 --track-origins=yes';

# -- timeout is part of GNU coreutils, some systems may not have it
my $TimeOut = '';
my $TimeOutSec = 30;    # max allowed time for single vw command run

# By default, we run all tests in the list
my $FullRun = 1;
my $ErrorCount = 0;

# These --side-by-side diff opts are used to make the
# fuzzy-compare easier: just split on '|' and compare numeric values
# word by word:
my $DiffOpts = '-N --minimal --suppress-common-lines --ignore-all-space --strip-trailing-cr --side-by-side -W 160';

# These diff options are used for the diff we want to show the user
# The intent is to make them easier to parse (and compare values) by a human
my $DisplayDiffOpts = '-u --minimal';

my @PathAdd = qw(. .. ../vowpalwabbit);

my @ToTest = ();

# __DATA__ test counter
my $TestNo = 0;

sub v($;@) {
    my $verbose_level = shift @_;
    return unless ($opt_v >= $verbose_level);

    if (@_ == 1) {
        print STDERR @_;
    } else {
        printf STDERR @_;
    }
}

sub usage(@) {
    print STDERR @_, "\n" if (@_);

    die "Usage: $0 [options] [testno...] [vw-executable]
    By default will run against the 1st 'vw' executable found in:
        @PathAdd  \$PATH

    Options:
        -c      print test-suite commands before running them
        -d      print diff output on significant diff failure
        -D      print diff output even if it is not significant
        -e      exit with non-zero status on first error
        -w      Ignore white-space differences (diff --ignore-space-change)
        -f      Ignore small (< $Epsilon) floating-point differences (fuzzy compare)
        -E<e>   Tolerance epsilon <e> for fuzzy compares (default $Epsilon)
        -o      Overwrite reference file with new/different result
        -y      On error, copy bad files to (eg stderr.test21) for later comparison
        -v<L>   Verbosity <L> (small integer) is verbosity level
        -V      apply valgrind to vw commands
        -t<T>   Apply timeout <T> (default $TimeOutSec) secs to individual tests
                (will only work where GNU coreutils 'timeout' is present)

    [testno...]   Optional integer args: explicit test numbers (skip others)
";
}

sub mysystem {
    my $cmd = shift;
    v(1, "%s\n", $cmd);
    system($cmd);
}

sub command_failed($) {
    # Deal with cases where vw crashes, exits prematurely etc.
    # print a message to distinguish between all cases
    # return non-zero status if anything is bad
    my $cmd = shift;
    my $exitcode = 0;
    if ($?) {
        $exitcode = $? >> 8;
        my $signal = $? & 127; 
        my $core = ''; if ($? & 128) { $core = ' (core dumped)'; }
        if ($signal) {
            printf STDERR
                "$0: test $TestNo: '%s' died from signal $signal$core\n", $cmd;
            $exitcode = 1;
        } elsif ($exitcode == 124) {
            printf STDERR
                "$0: test $TestNo: '%s' timed-out (exitcode=$exitcode)\n" .
                "$0: test $TestNo: you may increase the imposed time-out: \$TimeOutSec=%d\n",
                $cmd, $TimeOutSec;
        } elsif ($exitcode) {
            printf STDERR
                "$0: test $TestNo: '%s' failed (exitcode=$exitcode)\n", $cmd;
        }
    }
    # This is non-zero only if $cmd failed
    $exitcode;
}

sub valgrind_errfile($) {
    my $testno = shift;
    "Test-$testno.valgrind-err";
}

#
# which vw executable to test against
#
sub which_vw() {
    if (@ARGV == 1 || @ARGV == 2) {
        my $exe = $ARGV[0];
        if (-f $exe && -x $exe) {
            printf STDERR "Testing vw: %s\n", $exe;
            return $exe;
        } else {
            usage("$0: argument $exe: not an executable file");
        }
    } elsif (@ARGV == 0) {
        foreach my $dir (@PathAdd, split(':', $ENV{PATH})) {
            my $exe = "$dir/vw";
            if (-x $exe) {
                printf STDERR "Testing vw: %s\n", $exe;
                return $exe;
            }
        }
    }
    usage("can't find a 'vw' executable to test on");
}

sub which_lda() {
    if (@ARGV == 2) {
        my $exe = $ARGV[1];
        if (-f $exe && -x $exe) {
            printf STDERR "Testing lda: %s\n", $exe;
            return $exe;
        } else {
            usage("$0: argument $exe: not an executable file");
        }
    } elsif (@ARGV == 0 || @ARGV == 1) {
        foreach my $dir (@PathAdd, split(':', $ENV{PATH})) {
            my $exe = "$dir/vw";
            if (-x $exe) {
                printf STDERR "Testing lda: %s\n", $exe;
                return $exe;
            }
        }
    }
    usage("can't find a 'lda' executable to test on");
}

sub init() {
    $0 =~ s{.*/}{};
    getopts('wcdDefyE:ov:Vt:') || usage();
    $opt_v = 0 unless (defined $opt_v and $opt_v);

    my $hostname = `hostname`; chomp($hostname);
    printf STDERR "Testing on: hostname=%s OS=%s\n", $hostname, $^O;

    if ($^O =~ /MSWin/i) {
        v(1, "OS is $^O\n");
        # On MS Windows we need to change paths to external executables
        # Assumes cygwin is installed
        $ENV{'PATH'} .= ':/cygdrive/c/cygwin/bin';
        # And just to be safe (probably not needed):
        $Diff  = 'c:\cygwin\bin\diff.exe';
        $Cat   = 'c:\cygwin\bin\cat.exe';
    }
    elsif ($^O =~ /cygwin/i){
        v(1,"OS is $^O\n");
        # On MS Windows we need to change paths to external executables
        # Assumes cygwin is installed
        $ENV{'PATH'} .= ':/cygdrive/c/cygwin/bin';
        # And just to be safe (probably not needed):
        $Diff  = 'c:/cygwin/bin/diff.exe';
        $Cat   = 'c:/cygwin/bin/cat.exe';
    }
    $Epsilon = $opt_E if ($opt_E);
    $Diff .= ' --ignore-space-change' if ($opt_w);
    my @num_args = ();
    my @exe_args = ();
    foreach my $arg (@ARGV) {
        if ($arg =~ /^\d+$/) {  # a test number
            push(@num_args, $arg);
            next;
        }
        push(@exe_args, $arg);
    }
    if (@num_args) {
        @ToTest = sort { $a <=> $b } @num_args;
        # add dummy element so we don't become empty on last test
        push(@ToTest, -1);
        $FullRun = 0;
    }
    @ARGV = @exe_args;

    $VW = which_vw();
    $LDA = which_lda();

    my $timeout = `which timeout 2>/dev/null`;
    if ($timeout =~ /timeout$/) {
        chomp($timeout);
        $TimeOut = $timeout;
        v(1,"timeout is: %s\n", $TimeOut);
    }
    if ($opt_t) {
        if ($opt_t =~ /^\d+$/) {
            $TimeOutSec = $opt_t;
        } else {
            usage("-t $opt_t: -t can only accept integer seconds");
        }
        warn "-t passed but this env doesn't have timeout installed\n"
            unless ($TimeOut);
    }
}

sub copy_file {
    my ($src_file, $dst_file) = @_;
    use File::Copy;
    print STDERR "\t\t-> copying output to $dst_file\n";
    copy($src_file, $dst_file);
}

sub trim_spaces($) {
    my $str = shift;
    $str =~ s/^\s+//;
    $str =~ s/\s+$//;
    $str;
}

#
# ref_file($default_name)
#   Reference file existence: if we're on Windows, AND
#   an alternate reference-file exists, give precedence
#   to the alternate file (file with a '-mswin' suffix.)
#
sub ref_file($) {
    my $file = shift;
    if ($^O =~ /MSWin/i) {
        my $win_reffile = "$file-mswin";
        if (-e $win_reffile) {
            return $win_reffile;
        }
    }
    $file;
}

sub next_test() {
    my ($cmd, $out_ref, $err_ref, $pred_ref, $pred);

    $TestNo++;
    while (! eof(DATA)) {
        my $line = <DATA>;
        last if (defined($line) && $line =~ /^\s*$/);

        next if ($line =~ /^\s*#/);  # skip comment lines

        if ($line =~ /{VW}/) {
            # The command line
            $cmd = trim_spaces($line);
            $cmd =~ s/{VW}/$VW/;
            if ($cmd =~ /\s-p\s+(\S+)/) {
                # -p predict_file
                $pred = $1;
            }
            next;
        }
        if ($line =~ /{LDA}/) {
            # The command line
            $cmd = trim_spaces($line);
            $cmd =~ s/{LDA}/$LDA/;
            if ($cmd =~ /\s-p\s+(\S+)/) {
                # -p predict_file
                $pred = $1;
            }
            next;
        }
        if ($line =~ m/\.stdout\b/) {
            $out_ref = ref_file(trim_spaces($line));
            next;
        }
        if ($line =~ /\.stderr\b/) {
            $err_ref = ref_file(trim_spaces($line));
            next;
        }
        if ($line =~ /\.predict\b/) {
            $pred_ref = ref_file(trim_spaces($line));
            next;
        }
        # A test can also be an arbitrary (shell) command line
        # if the 1st non-space sequence is executable
        my ($first_word) = ($line =~ /^\s*(\S+)/);
        if (defined $first_word && -x $first_word) {
            $cmd = trim_spaces($line);
            next;
        }
        # if we get here it is some unrecognized pattern
        if (! -e $first_word) {
            # 1st word must exist, either as a reference-file
            # or some command
            printf STDERR "$0: %s: %s\n", $first_word, $!;
        }
        printf STDERR "$0: failed to run test line:\n\t%s\n", $line;
        print STDERR "$0: test lines must be one of the following:\n";
        print STDERR "\tshell command to run\n";
        print STDERR "\tvw command to run: {VW} ...\n";
        print STDERR "\tstdout reference:  *.stdout\n";
        print STDERR "\tstderr reference:  *.stderr\n";
        print STDERR "\tpredict reference: *.predict\n";
    }
    if (eof(DATA) && !defined $cmd) {
        return (undef, undef, undef, undef);
    }
    unless (defined $cmd) {
        die "$0: test $TestNo: command is undefined\n";
    }
    unless (defined $err_ref) {
        v(2, "%s: test %s: stderr ref: undefined\n", $0, $TestNo);
        $err_ref = '/dev/null';
    }
    # print STDERR "next_test: (\$cmd, $out_ref, $err_ref, $pred_ref, $pred)\n";
    if ($opt_V) {
        $cmd = sprintf("%s --log-file='%s' %s",
                        $Valgrind, valgrind_errfile($TestNo), $cmd);
    } elsif ($TimeOut) {
        $cmd = sprintf("%s %u %s", $TimeOut, $TimeOutSec, $cmd);
    }
    ($cmd, $out_ref, $err_ref, $pred_ref, $pred);
}

#
# If the difference is small (least significant digits of numbers)
# treat it as ok. It may be a result of 32 vs 64 bit calculations.
#
use Scalar::Util qw(looks_like_number);

sub lenient_array_compare($$) {
    my ($w1_ref, $w2_ref) = @_;
    my (@w1) = @$w1_ref;
    my (@w2) = @$w2_ref;

    # print STDERR "lenient_array_compare: (@w1) (@w2)\n";
    return 1 if ($#w1 != $#w2); # arrays not of same size
    my $nelem = scalar @w1;
    for (my $i = 0; $i < $nelem; $i++) {
        my ($word1, $word2) = ($w1[$i], $w2[$i]);
        # print STDERR "\t$word1 == $word2 ?\n";
        next if ($word1 eq $word2);

        # There's some difference, is it significant?
        unless (looks_like_number($word1)) {
            v(4, "$word1 vs $word2: word1=$word1 is not a number!\n");
            return 1;
        }
        unless (looks_like_number($word2)) {
            v(4, "$word1 vs $word2: word2=$word2 is not a number!\n");
            return 1;
        }

        my $delta = abs($word1 - $word2);

        if ($delta > $Epsilon) {
            # We have a 'big enough' difference, but this difference
            # may still not be meaningful in all contexts:

            # Big numbers should be compared by ratio rather than
            # by difference

            # Must ensure we can divide (avoid div-by-0)
            if (abs($word2) <= 1.0) {
                # If numbers are so small (close to zero),
                # ($delta > $Epsilon) suffices for deciding that
                # the numbers are meaningfully different
                v(4, "$word1 vs $word2: delta=$delta > Epsilon=$Epsilon\n");
                return 1;
            }
            # Now we can safely divide (since abs($word2) > 0)
            # and determine the ratio difference from 1.0
            my $ratio_delta = abs($word1/$word2 - 1.0);
            if ($ratio_delta > $Epsilon) {
                v(4, "$word1 vs $word2: ratio_delta=$ratio_delta > Epsilon=$Epsilon\n");
                return 1;
            }
        }
    }
    # print STDERR "lenient_array_compare: no meaningful difference\n";
    return 0; # no meaningful difference
}

sub diff_lenient_float($$) {
    my ($reffile, $outfile) = @_;
    my $status = 0;

    my $tmpf = 'lenient-diff.tmp';
    mysystem("$Diff $DiffOpts $reffile $outfile >$tmpf");
    $status = $? >> 8;
    v(2, "diff produced $tmpf: status=$status\n");
    if (-s $tmpf) {
        # The diff has something in it.
        my $fuzzy_status = 0;   # assume innocent till proven guilty
        open(my $sdiff, $tmpf) || die "$0: diff_lenient_float: $tmpf: $!\n";
        while (<$sdiff>) {
            chomp;
            my ($line1, $line2) = split(/\s*\|\s*/, $_);
            unless (defined($line1) && defined($line2)) {
                my $save_diff_file = "test-$TestNo.lenient-diff";
                warn "$0: test $TestNo: $tmpf: line $.: fuzzy-match missing data on one of the sides. Can't compare\n$_\n";
                warn "$0: test $TestNo: saving lenient diff in '$save_diff_file' for later inspection\n";
                close $sdiff;
                rename($tmpf, $save_diff_file);
                return 1;
            }
            v(3, "line1: %s\n", $line1);
            v(3, "line2: %s\n", $line2);

            # Break lines into tokens/words
            my (@w1) = split(' ', $line1);
            my (@w2) = split(' ', $line2);
            if (lenient_array_compare(\@w1, \@w2) != 0) {
                $fuzzy_status = 1;
                last;
            }
        }
        close $sdiff;
        $status = $fuzzy_status;
    }
    $status;
}

#
# perl internal way to emulate 'touch'
#
sub touch(@) {
    my $now = time;
    utime $now, $now, @_;
}

sub display_diff($$) {
    my ($reference_file, $actual_file) = @_;
    my $diff_cmd = "$Diff $DisplayDiffOpts $reference_file $actual_file";

    printf STDERR "--- %s\n", $diff_cmd;
    mysystem($diff_cmd);
}

sub diff($$) {
    my ($reffile, $outfile) = @_;
    my $status = 0;
    $reffile = '' unless (defined $reffile);

    # Special case, empty file w/o reference is not considered a failure.
    # This is a most common case with stdout.
    unless (-e $reffile) {
        if (-s $outfile > 0) {
            warn "$0: test $TestNo: stdout ref: $reffile: $!\n";
            exit 1 if ($opt_e);
            return 2 unless ($opt_o);
        } else {
            # Empty output without a ref is not considered a failure
            v(1, "$0: test $TestNo: empty output w/o reference: ignored.\n");
            return 0;
        }
    }

    # Actually run the diff
    my $diff_cmd = "$Diff $DiffOpts $reffile $outfile";
    mysystem("$diff_cmd >diff.tmp");
    $status = $? >> 8;
    v(2, "$diff_cmd >diff.tmp: status=$status\n");
 
    if (-s 'diff.tmp') {
        # There's some difference
        v(2, "diff.tmp has something in it. Is it meaningful?\n");

        if ($opt_f && -e $reffile && -e $outfile &&
            diff_lenient_float($reffile, $outfile) == 0) {

            print STDERR "$0: test $TestNo: minor (<$Epsilon) precision differences ignored\n";
            $status = 0;
        }
        if ($opt_D or ($opt_d && $status)) {
            # Print the diff only iff:
            #   1) -D is in effect  OR
            #   2) -d is in effect and diff is significant
            display_diff($reffile, $outfile);
        }
        if ($opt_o) {
            print STDERR "-o: overwriting reference:\n";

            if (-e $reffile) {
                print STDERR "\t$reffile -> $reffile.prev\n";
                rename($reffile, "$reffile.prev") ||
                    die "FATAL: rename($reffile, $reffile.prev): $!\n";
            }
            print STDERR "\t$outfile -> $reffile\n";
            rename($outfile, $reffile) ||
                die "FATAL: rename($outfile, $reffile): $!\n";

            unless ($opt_e) {
                $status = 0;
            }
        }
    }
    $status;
}

#
# check_for_time_regression()
#   Compare last overall time to run to current to catch
#   performance regressions
#
my $LastTimeFile = 'RunTests.last.times';

sub write_times($@) {
    my ($file, @times) = @_;
    open(my $fh, ">$file") || die "$0: can't open(>$file): $!\n";
    print $fh join(' ', @times), "\n";
    close $fh;
}
sub read_times($) {
    my ($file) = @_;
    open(my $fh, $file) || die "$0: can't open($file): $!\n";
    my $line = <$fh>; chomp $line;
    close $fh;
    return (split(' ', $line));
}

sub check_for_time_regression() {
    my $tolerate_regress = 1.02;
    my $pct_change = 0.0;
    my ($overall_time0, $overall_time1);
    my ($user0, $system0, $cuser0, $csystem0);
    my ($user1, $system1, $cuser1, $csystem1) = times;
    $overall_time1 = $cuser1 + $csystem1;

    if (-e $LastTimeFile) {
        ($user0, $system0, $cuser0, $csystem0) = read_times($LastTimeFile);
        if (!(defined $csystem0) or !(defined $cuser0)) {
            die "$0: undefined times in saved times file: $LastTimeFile," .
                    " try removing it\n"
        }
        $overall_time0 = $cuser0 + $csystem0;
        $pct_change = 100 * ($overall_time1 - $overall_time0) / (1e-4+$overall_time0);

        if ($overall_time0 == 0) {
            die "$0: Bad times in saved times file: $LastTimeFile," .
                    " try removing it\n"
        } elsif ($overall_time1/$overall_time0 > $tolerate_regress) {
            printf STDERR "$0: RUNTIME REGRESSION: " .
                    "%.2f sec vs last time %.2f sec. (%.2f%% worse)\n",
                    $overall_time1, $overall_time0, $pct_change;
        }
    }
    write_times($LastTimeFile, $user1, $system1, $cuser1, $csystem1);
    printf STDERR
        "$0 runtime: user %g, system %g, total %g sec (%+.2f%% vs. last)\n",
                $cuser1, $csystem1, $overall_time1, $pct_change;
}

sub run_tests() {
    print STDERR "$0: '-D' to see any diff output\n"
        unless ($opt_D);
    print STDERR "$0: '-d' to see only significant diff output\n"
        unless ($opt_d);
    print STDERR "$0: '-o' to force overwrite references\n"
        unless ($opt_o);
    print STDERR "$0: '-e' to abort/exit on first failure\n"
        unless ($opt_e);

    my ($cmd, $out_ref, $err_ref, $pred_ref);
    my ($outf, $errf, $predf);

    mkdir('models', 0755) unless (-d 'models');

    unlink(glob('*.tmp'));
    unlink(glob('*.cache'));
    unlink(glob('*/*.cache'));

    while (($cmd, $out_ref, $err_ref, $pred_ref, $predf) = next_test()) {
        last unless (defined $cmd);
        if (@ToTest) {
            if ($ToTest[0] != $TestNo) {
                # warn "$0: test $TestNo: skipped\n";
                next;
            } else {
                shift(@ToTest);
            }
        }

        ($outf, $errf) = ('stdout.tmp', 'stderr.tmp');
        unlink $outf, $errf;    # just in case, to be safer

        # run the test
        print STDERR "Test $TestNo: ($cmd) >$outf 2>$errf\n" if ($opt_c);
        mysystem("($cmd) >$outf 2>$errf");
        my $full_status = $?;
        my $status = $full_status >> 8;
        unless ($opt_V) {
            if (my $failure = command_failed($cmd)) {
                print STDERR `$Cat $errf`
                    unless ($failure == 124);
                if ($opt_e) {
                    printf STDERR "$0: exiting with status=$failure\n";
                    exit $failure;
                }
                next;
            }
        }
        if ($status) {
            $ErrorCount++;
            if ($opt_V && $status == 100) {
                my $errfile = valgrind_errfile($TestNo);
                warn "$0: test $TestNo: FAILED: valgrind errors in $errfile\n";
            } elsif ($TimeOut && $status == 124) {
                warn "$0: test $TestNo: FAILED: timeout $TimeOutSec exceeded\n";
            } else {
                warn "$0: test $TestNo: '$cmd' failed: status=$status\n";
            }
            exit $full_status if ($opt_e);
            next;
        }

        # command succeded
        # -- compare stdout
        $status = diff($out_ref, $outf);
        if ($status) {
            $ErrorCount++;
            printf STDERR "%s: test %d: FAILED: ref(%s) != stdout(%s)\n\tcmd: $cmd\n",
                $0, $TestNo, $out_ref, $outf;
            if ($opt_y) { copy_file($outf, $outf . '.test' . $TestNo); }
            exit $status if ($opt_e);
        } else {
            if (defined $out_ref) {
                print STDERR "$0: test $TestNo: stdout OK\n";
            } else {
                v(1, "$0: test $TestNo: stdout OK (no reference)\n");
            }
        }

        # -- compare stderr
        if (! -e $err_ref  and  ! $opt_o) {
            $ErrorCount++;
            print STDERR "$0: test $TestNo: FAILED: stderr ref: $err_ref: $!\n\tcmd: $cmd\n";
            exit 1 if ($opt_e);
            next;
        }
        $status = diff($err_ref, $errf);
        if ($status) {
            $ErrorCount++;
            printf STDERR "%s: test %d: FAILED: ref(%s) != stderr(%s)\n\tcmd: $cmd\n",
                $0, $TestNo, $err_ref, $errf;
            if ($opt_y) { copy_file($errf, "$errf.test.$TestNo"); }
            exit $status if ($opt_e);
        } else {
            print STDERR "$0: test $TestNo: stderr OK\n";
        }
        # -- compare predict
        next unless (defined $pred_ref);
        $predf = 'predict.tmp' unless (defined $predf);
        $status = diff($pred_ref, $predf);
        if ($status) {
            $ErrorCount++;
            printf STDERR "%s: test %d: FAILED: ref(%s) != predict(%s)\n\tcmd: $cmd\n",
                $0, $TestNo, $pred_ref, $predf;
            if ($opt_y) { copy_file($predf, "$predf.test.$TestNo"); }
            exit $status if ($opt_e);
        } else {
            print STDERR "$0: test $TestNo: predict OK\n";
        }
    }

    if ($FullRun == 0) {
        v(1, "Partial run: not recording overall time\n");
    } elsif ($ErrorCount > 0) {
        v(1, "Errors found: not recording overall time\n");
    } elsif ($opt_V) {
        v(1, "valgrind run: not recording overall time\n");
    } else {
        check_for_time_regression();
    }
}

# --- main
init();
run_tests();
exit $ErrorCount;

#
# Add tests below the __DATA__ line
# Each test is a series of lines, terminated by an empty line (or EOF)
#
# Each test is comprised of:
#   1st line-item is the command to run, {VW} represents the vw
#   executable.
#
#   By default, 'vw' in the parent dir (../vw) is tested.
#   To run against a different reference executable, just pass the
#   executable as an argument to RunTests
#
# The next (output) line-items are reference files to compare outputs to:
#    The expected (reference file) standard output
#    The expected (reference file) standard error
#    The expected (reference file) for predictions (-p ...)
#    [The above reference files can come in any order.
#     Their 'type' is determined by their extensions:
#            .stdout  .stderr  .predict
#    ]
#
# All filenames are relative to this (test) directory
#
# The temporary output file-names (as opposed to the reference ones)
# are implicit:
#    (stdout.tmp  stderr.tmp  predict.tmp)
# Except: if -p ... appears in the command, it will be used as the
# (explicit) predictions file.
#
# Windows note:
#
# Due to differences in Random-Number-Generators in Windows,
# floating-point outputs may differ in some tests (not all).
#
# To minimize the need for changes (leverage existing tests and
# reference files as much as possible), on Windows we check for
# existance of files with '-mswin' suffix:
#   *.stderr-mswin
#   *.stdout-mswin
#   *.predict-mswin
# and if any of them exists, we use it instead.
#
__DATA__
# Test 1:
{VW} -k -l 20 --initial_t 128000 --power_t 1 -d train-sets/0001.dat -f models/0001.model -c --passes 8 --invariant --ngram 3 --skips 1 --holdout_off
    train-sets/ref/0001.stderr

# Test 2: checking predictions as well
{VW} -k -t train-sets/0001.dat -i models/0001.model -p 001.predict.tmp --invariant
    test-sets/ref/0001.stderr
    pred-sets/ref/0001.predict

# Test 3: without -d, training only
{VW} -k train-sets/0002.dat -f models/0002.model --invariant
    train-sets/ref/0002.stderr

# Test 4: same, with -d
{VW} -k -d train-sets/0002.dat -f models/0002.model --invariant
    train-sets/ref/0002.stdout
    train-sets/ref/0002.stderr

# Test 5: add -q .., adaptive, and more (same input, different outputs)
{VW} -k --initial_t 1 --adaptive --invariant -q Tf -q ff -f models/0002a.model train-sets/0002.dat
    train-sets/ref/0002a.stderr

# Test 6: run predictions on Test 4 model
# Pretending the labels aren't there
{VW} -k -t -i models/0002.model -d train-sets/0002.dat -p 0002b.predict
    test-sets/ref/0002b.stderr
    pred-sets/ref/0002b.predict

# Test 7: using normalized adaptive updates and a low --power_t
{VW} -k --power_t 0.45 -f models/0002c.model train-sets/0002.dat
    train-sets/ref/0002c.stderr

# Test 8: predicts on test 7 model
{VW} -k -t -i models/0002c.model -d train-sets/0002.dat -p 0002c.predict
    test-sets/ref/0002c.stderr
    pred-sets/ref/0002c.predict

# Test 9: label-dependent features with csoaa_ldf
{VW} -k -c -d train-sets/cs_test.ldf -p cs_test.ldf.csoaa.predict --passes 10 --invariant --csoaa_ldf multiline --holdout_off
    train-sets/ref/cs_test.ldf.csoaa.stderr
    train-sets/ref/cs_test.ldf.csoaa.predict

# Test 10: label-dependent features with wap_ldf
{VW} -k -c -d train-sets/cs_test.ldf -p cs_test.ldf.wap.predict --passes 10 --invariant --wap_ldf multiline --holdout_off
    train-sets/ref/cs_test.ldf.wap.stderr
    train-sets/ref/cs_test.ldf.wap.predict

# Test 11: one-against-all
{VW} -k --oaa 10 -c --passes 10 train-sets/multiclass --holdout_off
    train-sets/ref/oaa.stderr

# Test 12: Error Correcting Tournament
{VW} -k --ect 10 --error 3 -c --passes 10 --invariant train-sets/multiclass --holdout_off
    train-sets/ref/multiclass.stderr

# Test 13: Run search (dagger) on wsj_small for 6 passes extra features
{VW} -k -c -d train-sets/wsj_small.dat.gz --passes 6 --search_task sequence --search 45 --search_alpha 1e-6 --search_max_bias_ngram_length 2 --search_max_quad_ngram_length 1 --holdout_off
    train-sets/ref/search_wsj.stderr

# Test 14: Run search (searn) on wsj_small for 6 passes extra features
{VW} -k -c -d train-sets/wsj_small.dat.gz --passes 6 --search_task sequence --search 45 --search_alpha 1e-6 --search_max_bias_ngram_length 2 --search_max_quad_ngram_length 1 --holdout_off --search_passes_per_policy 3 --search_interpolation policy
    train-sets/ref/search_wsj2.dat.stdout
    train-sets/ref/search_wsj2.dat.stderr

# Test 15: LBFGS on zero derivative input
{VW} -k -c -d train-sets/zero.dat --loss_function=squared -b 20 --bfgs --mem 7 --passes 5 --l2 1.0 --holdout_off
    train-sets/ref/zero.stdout
    train-sets/ref/zero.stderr

# Test 16: LBFGS early termination
{VW} -k -c -d train-sets/rcv1_small.dat --loss_function=logistic -b 20 --bfgs --mem 7 --passes 20 --termination 0.001 --l2 1.0 --holdout_off
    train-sets/ref/rcv1_small.stdout
    train-sets/ref/rcv1_small.stderr

# Test 17: Run LDA with 100 topics on 1000 Wikipedia articles
{LDA} -k --lda 100 --lda_alpha 0.01 --lda_rho 0.01 --lda_D 1000 -l 1 -b 13 --minibatch 128 train-sets/wiki256.dat
    train-sets/ref/wiki1K.stderr

# Test 18: Run search on seq_small for 12 passes, 4 passes per policy
{VW} -k -c -d train-sets/seq_small --passes 12 --invariant --search 4 --search_task sequence --holdout_off
    train-sets/ref/search_small.stderr

# Test 19: neural network 3-parity with 2 hidden units
{VW} -k -c -d train-sets/3parity --hash all --passes 3000 -b 16 --nn 2 -l 10 --invariant -f models/0021.model --random_seed 15 --quiet --holdout_off
    train-sets/ref/3parity.stderr

# Test 20: neural network 3-parity with 2 hidden units (predict)
{VW} -d train-sets/3parity --hash all -t -i models/0021.model -p 0022.predict.tmp
    pred-sets/ref/0022.stderr
    pred-sets/ref/0022.predict

# Test 21: cubic features -- on a parity test case
{VW} -k -c -f models/xxor.model train-sets/xxor.dat --cubic abc --passes 100 --holdout_off --progress 1.33333
    train-sets/ref/xxor.stderr

# Test 22: matrix factorization -- training
{VW} -k -d train-sets/ml100k_small_train -b 16 -q ui --rank 10 --l2 2e-6 --learning_rate 0.05 --passes 2 --decay_learning_rate 0.97 --power_t 0 -f models/movielens.reg -c --loss_function classic --holdout_off
    train-sets/ref/ml100k_small.stdout
    train-sets/ref/ml100k_small.stderr

# Test 23: matrix factorization -- testing
{VW} -i models/movielens.reg -t -d test-sets/ml100k_small_test
    test-sets/ref/ml100k_small.stdout
    test-sets/ref/ml100k_small.stderr

# Test 24: active-learning -- training
{VW} -k --active --simulation --mellowness 0.000001 -d train-sets/rcv1_small.dat -l 10 --initial_t 10 
    train-sets/ref/active-simulation.t24.stderr

# Test 25: bagging -- training regressor
{VW} -k -d train-sets/0002.dat -f models/bs.reg.model --bootstrap 4 -p bs.reg.predict
    train-sets/ref/bs.reg.stderr
    train-sets/ref/bs.reg.predict

# Test 26: bagging -- predicting with bagged regressor
{VW} -d train-sets/0002.dat -i models/bs.reg.model -p bs.prreg.predict -t
    train-sets/ref/bs.prreg.stderr
    train-sets/ref/bs.prreg.predict

# Test 27: bagging -- binary classifiers
{VW} -d train-sets/0001.dat -f models/bs.vote.model --bootstrap 4 --bs_type vote -p bs.vote.predict
    train-sets/ref/bs.vote.stderr
    train-sets/ref/bs.vote.predict

# Test 28: bagging -- predict with bagged classifier
{VW} -d train-sets/0001.dat -i models/bs.vote.model -p bs.prvote.predict -t
    train-sets/ref/bs.prvote.stderr
    train-sets/ref/bs.prvote.predict

# Test 29: affix features
{VW} -d train-sets/affix_test.dat -k -c --passes 10 --holdout_off --affix -2
    train-sets/ref/affix_test.stderr

# Test 30: train --l1 regularized model
{VW} -d train-sets/0001.dat -f models/mask.model --invert_hash mask.predict --l1 0.01
    train-sets/ref/mask.stderr

# Test 31: train model using --feature_mask
{VW} -d train-sets/0001.dat --invert_hash remask.predict --feature_mask models/mask.model -f models/remask.model
    train-sets/ref/remask.stderr

# Test 32: train model using --feature_mask and --initial_regressor
{VW} -d train-sets/0001.dat --feature_mask models/mask.model -i models/remask.model
    train-sets/ref/remask.final.stderr

# Test 33: train model for topk recommender
{VW} -d train-sets/topk.vw -f topk.model -q MF --passes 100 --cache_file topk-train.cache -k --holdout_off
    train-sets/ref/topk-train.stderr

# Test 34: train model for topk recommender
{VW} -P 1 -d train-sets/topk.vw -i topk.model --top 2 -p topk.predict
    train-sets/ref/topk-rec.stderr
    train-sets/ref/topk-rec.predict

# Test 35: non-centered data-set where constant >> 0
#   To test the new --constant option without which performance is very weak
{VW} -k --passes 100 -c --holdout_off --constant 1000 -d train-sets/big-constant.dat
    train-sets/ref/big-constant.stderr

# Test 36: new option: --progress w/ integer arg
{VW} -k -d train-sets/0001.dat --progress 10
    train-sets/ref/progress-10.stderr

# Test 37: new-option: --progress w/ floating-point arg
#           + alternate short form (-P)
{VW} -k -d train-sets/0001.dat -P 0.5
    train-sets/ref/progress-0.5.stderr

# Test 38: --nn without --quiet to avoid nn regressions
#   (Needs to be a simple test, not one sensitive to symmetry breaking)
{VW} -k -d train-sets/0001.dat --nn 1
    train-sets/ref/nn-1-noquiet.stderr

# Test 39: cb with dr
{VW} -d train-sets/rcv1_raw_cb_small.vw --cb 2 --cb_type dr --ngram 2 --skips 4 -b 24 -l 0.25
    train-sets/ref/rcv1_raw_cb_dr.stderr

# Test 40: cb with ips
{VW} -d train-sets/rcv1_raw_cb_small.vw --cb 2 --cb_type ips --ngram 2 --skips 4 -b 24 -l 0.125
    train-sets/ref/rcv1_raw_cb_ips.stderr

# Test 41: cb with dm
{VW} -d train-sets/rcv1_raw_cb_small.vw --cb 2 --cb_type dm --ngram 2 --skips 4 -b 24 -l 0.125
    train-sets/ref/rcv1_raw_cb_dm.stderr

# Test 42: --lda --passes 2 hang regression
{VW} -k -d train-sets/lda-2pass-hang.dat --lda 10 -c --passes 2 --holdout_off
    train-sets/ref/lda-2pass-hang.stderr

# Test 43: search sequence labeling, non-ldf train
{VW} -k -c -d train-sets/sequence_data --passes 20 --invariant --search_rollout oracle --search_alpha 1e-8 --search_task sequence --search 5 --holdout_off -f models/sequence_data.model
    train-sets/ref/sequence_data.nonldf.train.stderr

# Test 44: search sequence labeling, non-ldf test
{VW} -d train-sets/sequence_data -t -i models/sequence_data.model -p sequence_data.predict
    train-sets/ref/sequence_data.nonldf.test.stderr
    train-sets/ref/sequence_data.nonldf.test.predict

# Test 45: make sure that history works
{VW} -k -c -d train-sets/seq_small2 --passes 4 --search 4 --search_task sequence --holdout_off
    train-sets/ref/search_small2.stderr

# Test 46: search sequence labeling, ldf train
{VW} -k -c -d train-sets/sequence_data --passes 20 --search_rollout oracle --search_alpha 1e-8 --search_task sequence_demoldf --csoaa_ldf m --search 5 --holdout_off -f models/sequence_data.ldf.model
    train-sets/ref/sequence_data.ldf.train.stderr

# Test 47: search sequence labeling, ldf test
{VW} -d train-sets/sequence_data -t -i models/sequence_data.ldf.model -p sequence_data.predict
    train-sets/ref/sequence_data.ldf.test.stderr
    train-sets/ref/sequence_data.ldf.test.predict

# Test 48: search sequence SPAN labeling BIO, non-ldf train, no rollouts
{VW} -k -c -d train-sets/sequencespan_data --passes 20 --invariant --search_rollout none --search_task sequencespan --search 7 --holdout_off -f models/sequencespan_data.model
    train-sets/ref/sequencespan_data.nonldf.train.stderr

# Test 49: search sequence SPAN labeling BIO, non-ldf test
{VW} -d train-sets/sequencespan_data -t -i models/sequencespan_data.model -p sequencespan_data.predict
    train-sets/ref/sequencespan_data.nonldf.test.stderr
    train-sets/ref/sequencespan_data.nonldf.test.predict

# Test 50: search sequence SPAN labeling BILOU, non-ldf train
{VW} -k -c -d train-sets/sequencespan_data --passes 20 --invariant --search_rollout oracle --search_alpha 1e-8 --search_task sequencespan --search_span_bilou --search 7 --holdout_off -f models/sequencespan_data.model
    train-sets/ref/sequencespan_data.nonldf-bilou.train.stderr

# Test 51: search sequence SPAN labeling BILOU, non-ldf test
{VW} -d train-sets/sequencespan_data -t --search_span_bilou -i models/sequencespan_data.model -p sequencespan_data.predict
    train-sets/ref/sequencespan_data.nonldf-bilou.test.stderr
    train-sets/ref/sequencespan_data.nonldf-bilou.test.predict

# Test 52: silly test for "argmax" task
{VW} -d train-sets/argmax_data -k -c --passes 20 --search_rollout oracle --search_alpha 1e-8 --search_task argmax --search 2 --holdout_off
    train-sets/ref/argmax_data.stderr

# Test 53: (holdout-broken regression)
# ensure we have no holdout loss of '0 h'
{VW} -k -c --passes 2 train-sets/0001.dat
    train-sets/ref/holdout-loss-not-zero.stderr

# Test 54: stagewise poly with exponent 0.25
####in the following stage_poly tests, there are minute differences in losses, which are not being fuzzy-diffed;
####thus the stderr is cleared (--quiet) and only comparing (fuzzy-diffed) predictions.
{VW} --stage_poly --sched_exponent 0.25 --batch_sz 1000 --batch_sz_no_doubling -d train-sets/rcv1_small.dat -p stage_poly.s025.predict --quiet
    train-sets/ref/stage_poly.s025.stderr
    train-sets/ref/stage_poly.s025.predict

# Test 55: stagewise poly with exponent 1.0
{VW} --stage_poly --sched_exponent 1.0 --batch_sz 1000 --batch_sz_no_doubling -d train-sets/rcv1_small.dat --quiet
    train-sets/ref/stage_poly.s100.stderr

# Test 56: stagewise poly with exponent 0.25 and doubling batches
{VW} --stage_poly --sched_exponent 0.25 --batch_sz 1000 -d train-sets/rcv1_small.dat -p stage_poly.s025.doubling.predict --quiet
    train-sets/ref/stage_poly.s025.doubling.stderr
    train-sets/ref/stage_poly.s025.doubling.predict

# Test 57: stagewise poly with exponent 1.0 and doubling batches
{VW} --stage_poly --sched_exponent 1.0 --batch_sz 1000 -d train-sets/rcv1_small.dat -p stage_poly.s100.doubling.predict --quiet
    train-sets/ref/stage_poly.s100.doubling.stderr
    train-sets/ref/stage_poly.s100.doubling.predict

# Test 58: library test, train the initial model
{VW} -c -k -d train-sets/library_train -f models/library_train.w -q st --passes 100 --hash all --noconstant --csoaa_ldf m --holdout_off
    train-sets/ref/library_train.stdout
    train-sets/ref/library_train.stderr

# Test 59: library test, run ezexample_predict
../library/ezexample_predict models/library_train.w
    train-sets/ref/ezexample_predict.stdout
    train-sets/ref/ezexample_predict.stderr

# Test 60: empty test, bad builds (without make clean)
# sometimes cause a SEGV even on empty input
{VW} /dev/null
    train-sets/ref/empty-set.stderr

# Test 61: daemon test
./daemon-test.sh
    test-sets/ref/vw-daemon.stdout

# Test 62: SVM linear kernel
{VW} --ksvm --l2 1 --reprocess 5 -b 18 -p linear.predict -d train-sets/rcv1_smaller.dat
    train-sets/ref/ksvm_train.linear.stderr
    train-sets/ref/ksvm_train.linear.predict

# Test 63: SVM polynomial kernel
{VW} --ksvm --l2 1 --reprocess 5 -b 18 --kernel poly -p ksvm_train.poly.predict -d train-sets/rcv1_smaller.dat
    train-sets/ref/ksvm_train.poly.stderr
    train-sets/ref/ksvm_train.poly.predict

# Test 64: SVM rbf kernel
{VW} --ksvm --l2 1 --reprocess 5 -b 18 --kernel rbf -p ksvm_train.rbf.predict -d train-sets/rcv1_smaller.dat
    train-sets/ref/ksvm_train.rbf.stderr
    train-sets/ref/ksvm_train.rbf.predict

# Test 65: Run search (dagger) on an entity-relation recognitions data set, er_small, for 6 passes with constraints
{VW} -k -c -d train-sets/er_small.vw --passes 6 --search_task entity_relation --search 10 --constraints --search_alpha 1e-8 
    train-sets/ref/search_er.stderr

# Test 66: Train a depenency parser with search (dagger) on wsj_small.dparser.vw.gz for 2 passes
{VW} -k -c -d train-sets/wsj_small.dparser.vw.gz --passes 2 --search_task dep_parser --search 3  --search_alpha 1e-4 --search_rollout oracle --holdout_off
    train-sets/ref/search_dep_parser.stderr

# Test 67: classification with data from dictionaries (eg embeddings or gazetteers) -- note that this is impossible without dictionaries because --ignore w
{VW} -k -c -d train-sets/dictionary_test.dat --binary --ignore w --holdout_off --passes 32 --dictionary w:train-sets/dictionary_test.dict
    train-sets/ref/dictionary_test.stderr

# Test 68: Search for multiclass classification
{VW} -k -c -d train-sets/multiclass.sch --passes 20 --search_task multiclasstask --search 10 --search_alpha 1e-4 --holdout_off
	train-sets/ref/search_multiclass.stderr

# Test 69: (see Test 43/Test 44): search sequence labeling, with beam
{VW} -d train-sets/sequence_data -t -i models/sequence_data.model -p sequence_data_beam.predict --search_beam 10 --search_kbest 10
    train-sets/ref/sequence_data.nonldf.beam.test.stderr
    train-sets/ref/sequence_data.nonldf.beam.test.predict

# Test 70: (see Test 46/47) search sequence labeling, ldf test, with beam
{VW} -d train-sets/sequence_data -t -i models/sequence_data.ldf.model -p sequence_data_beam.predict --search_beam 10 --search_kbest 10
    train-sets/ref/sequence_data.ldf.beam.test.stderr
    train-sets/ref/sequence_data.ldf.beam.test.predict

# Test 71: autolink
{VW} -d train-sets/0002.dat --autolink 1 --examples 100 -p 0002.autolink.predict
    train-sets/ref/0002.autolink.stderr
    train-sets/ref/0002.autolink.predict
