#!/usr/bin/perl -w

# Generate any number of cross validation sets from an annotation set.
# Assigns each gene to one of the sets and outputs a training set and
# a testing set for each.  Each testing set includes one of the n sets of genes
# and the matching training set includes the other n-1 sets.
# Leaves the training and testing sets in the directory supplied as "out".

use strict;

# Help text passed to die() when the command line cannot be understood.
# The heredoc interpolates, so $0 expands to the name the script was run as.
my $usage = <<"END";
Usage: $0 num out [-r|-e] (gtf_file|gtf_dir) [ [-r|-e] (gtf_file|gtf_dir) ... ]
    Creates symbolic links to the files for training and testing
    Input a number of sets for training and testing (num)
    Input an output directory for the sets (out)
    If the -r flag is used before a gtf file or directory, only use for tRaining
    If the -e flag is used before a gtf file or directory, only use for tEsting
    All exclusive files will be included in all training and testing sets.
    There must be at least one gtf file or directory that is used for both 
    training and testing.
END

# The command line needs at least a set count, an output directory and one
# annotation file or directory.
@ARGV > 2 or die $usage;

# Number of cross validation sets.  Require a plain integer so that the
# set0 .. set(N-1) directories and the set numbering used later in the script
# agree on how many sets exist.
my $num_sets = shift @ARGV;
die $usage."Must have at least 2 cross validation sets for num_sets\n"
    unless $num_sets =~ /\A\d+\z/ && $num_sets >= 2;

# Base output directory; the symlink trees live in training/ and testing/
# underneath it.
my $out_base = shift @ARGV;

my $trn_dir = "$out_base/training";
my $tst_dir = "$out_base/testing";

# Create the base directory when it is missing (one level only).
unless (-d $out_base) {
    mkdir $out_base
        or die "Could not make output directory $out_base: $!\n";
}

# Create training/ and testing/ and, inside each, one setN directory per set.
# Every directory is checked on its own, so a training/ or testing/ directory
# that already exists still gets any set directories it is missing.
for my $tree ([ training => $trn_dir ], [ testing => $tst_dir ]) {
    my ($label, $dir) = @{$tree};
    unless (-d $dir) {
        mkdir $dir or die "Could not make $label directory.\n";
    }
    for my $i (0 .. $num_sets - 1) {
        my $set_dir = "$dir/set$i";
        next if -d $set_dir;
        mkdir $set_dir
            or die "Could not make $label set directory $set_dir: $!\n";
    }
}

# Annotation files, grouped by how they may be used:
#   @anns      - split across the cross validation sets (training and testing)
#   @trn_only  - named after -r, used for training only
#   @tst_only  - named after -e, used for testing only
my (@anns, @trn_only, @tst_only);

while (@ARGV) {
    my $opt = shift @ARGV;
    if ($opt eq '-r' or $opt eq '-e') {
        # A flag must be followed by the file or directory it applies to.
        @ARGV or die $usage."Option $opt must be followed by a gtf file or directory\n";
        my @cur_files = get_file_or_dir(shift @ARGV);
        if ($opt eq '-r') { push @trn_only, @cur_files }
        else              { push @tst_only, @cur_files }
    }
    elsif ($opt =~ /^-/) {
        # Unknown flag: reject it before touching the argument after it.
        die $usage;
    }
    else {
        push @anns, get_file_or_dir($opt);
    }
}

# The usage text requires at least one annotation that is shared between
# training and testing; without one there is nothing to cross validate.
@anns or die $usage."No gtf files were given for both training and testing\n";

use List::Util qw(shuffle);

# $tst_sets[$n] is a reference to the list of annotations whose testing set
# is set $n.  Start every set with an empty list.
my @tst_sets = map { [] } 1 .. $num_sets;

# Assign annotations at random, but deal them out round-robin from a shuffled
# list so that set sizes differ by at most one.  Drawing a random set for
# each annotation independently can leave a set with no testing data at all.
my $next_set = 0;
for my $ann (shuffle @anns) {
    push @{ $tst_sets[$next_set] }, $ann;
    $next_set = ($next_set + 1) % $num_sets;
}

use File::Spec;

# Write the cv_sets report and populate the set directories with symlinks.
# REPORT is kept as a package filehandle so that any reference to it by name
# elsewhere in the script still resolves.
open REPORT, '>', "$out_base/cv_sets"
    or die "Could not open $out_base/cv_sets for writing: $!\n";
print REPORT "Cross validation sets created on ".scalar(localtime)."\n".
        "Testing/training sets\n";
for (my $i = 0; $i < $num_sets; $i++) {
    print REPORT "Set $i:\n";
    for my $ann (@{ $tst_sets[$i] }) {
        print REPORT "$ann\n";
        # An annotation is tested in its own set and trained on in all others.
        link_annotation($ann, "$tst_dir/set$i");
        for (my $j = 0; $j < $num_sets; $j++) {
            next if $i == $j;
            link_annotation($ann, "$trn_dir/set$j");
        }
    }
    # Exclusive annotations are added to every set of their kind.
    link_annotation($_, "$tst_dir/set$i") for @tst_only;
    link_annotation($_, "$trn_dir/set$i") for @trn_only;
}
print REPORT "Testing only annotations:\n";
for my $tst_ann (@tst_only) { print REPORT "$tst_ann\n"; }
print REPORT "Training only annotations:\n";
for my $trn_ann (@trn_only) { print REPORT "$trn_ann\n"; }
# Buffered write errors only surface at close, so check it.
close REPORT
    or die "Could not finish writing $out_base/cv_sets: $!\n";

# Create a symlink to annotation $ann inside directory $dir, named after the
# last path component of $ann.  The link target is made absolute because a
# relative target would be resolved relative to $dir, not the current
# directory, and so would dangle.  A failure (for example a name that is
# already taken) is reported with a warning and does not stop the run.
sub link_annotation {
    my ($ann, $dir) = @_;
    (my $short_name = $ann) =~ s{.*/}{};
    my $target = File::Spec->rel2abs($ann);
    symlink($target, "$dir/$short_name")
        or warn "$0: could not link $target as $dir/$short_name: $!\n";
    return;
}

# Expand one command line argument into a list of paths.
#   $file - path of a plain file or of a directory (extra arguments ignored)
# Returns ($file) for a plain file.  For a directory, returns "$file/NAME"
# for every entry whose name does not start with a dot, sorted by name.
# Dies if $file is missing or is neither a plain file nor a directory.
sub get_file_or_dir {
    my ($file) = @_;
    die "$0: missing file or directory argument\n" unless defined $file;

    my @files;
    if (-d $file) {
        # Read the directory directly rather than with glob("$file/*"):
        # glob splits its pattern on whitespace and expands metacharacters,
        # so directory names containing either would be listed wrongly.
        opendir my $dh, $file
            or die "$0: $file: cannot read directory: $!\n";
        @files = map { "$file/$_" } sort grep { !/^\./ } readdir $dh;
        closedir $dh;
    }
    elsif (-f $file) {
        @files = ($file);
    }
    else {
        die "$0: $file: no such file or directory\n";
    }
    return @files;
}
