#!/usr/bin/perl -w

# Generate any number of cross validation sets from an annotation set.
# Assigns each gene at random to one of the num_sets sets and, for every set,
# outputs a testing file and a training file.  The testing file for a set
# holds the genes assigned to that set; the training file holds the genes
# assigned to the other num_sets-1 sets.
# Leaves the training and testing sets in the directory supplied by out_dir.

use GTF;
use File::Basename;
use Getopt::Long;
use strict;

# Help text shown for -h, for bad options and for bad arguments.  The
# directory and file names described here must match the ones built further
# down in the script (setN subdirectories, or *.training.setN.gtf /
# *.testing.setN.gtf files in flat mode).
my $usage = "Usage: $0 [options] num_sets out_dir gtf_file [ gtf_file ... ]
    Generate cross validation sets from a group of GTF files.
    Each file is opened and genes are chosen at random to be placed in
    num_sets cross validation sets.  These sets become the 'testing'
    sets and are left in files (with the same names as the input files)
    in out_dir/testing under num_sets subdirectories, named setx where x
    is the set ordinal (starting at 0).  All other genes not in testing
    set x are left in the matching file under out_dir/training/setx.

Options:
  -f    do not generate directories, use a flat hierarchy.  Files are
        written directly to out_dir as name.training.setx.gtf and
        name.testing.setx.gtf.
  -h    display this help message and exit.
";

# The options are declared by their long names; -f and -h are accepted
# because Getopt::Long allows unambiguous abbreviations by default.
my %opts;
GetOptions(\%opts, "flat", "helpme") or die($usage);
$opts{helpme} and die($usage);

# Need num_sets, out_dir and at least one GTF file.
scalar(@ARGV) > 2 or die $usage;

# num_sets drives loop bounds and file names below, so insist on a plain
# non-negative integer instead of relying on numeric coercion (which would
# let values such as "3.5" or "2abc" through).
my $num_sets = shift @ARGV;
die $usage."Must have at least 2 cross validation sets for num_sets"
    unless $num_sets =~ /\A\d+\z/ && $num_sets >= 2;
my $out_base = shift @ARGV;

# Work out where training and testing files go and make sure every directory
# that will be written to exists.
#   flat mode:    everything goes directly into $out_base
#   default mode: $out_base/training/setN and $out_base/testing/setN
my ($trn_dir, $tst_dir);
my @needed_dirs = ($out_base);
if ($opts{flat}) {
    $trn_dir = $out_base;
    $tst_dir = $out_base;
}
else {
    $trn_dir = "$out_base/training";
    $tst_dir = "$out_base/testing";

    # Parents are listed before their children so they are created first.
    for my $parent ($trn_dir, $tst_dir) {
        push @needed_dirs, $parent;
        for (my $i = 0; $i < $num_sets; $i++) {
            push @needed_dirs, "$parent/set$i";
        }
    }
}

# Each directory is checked on its own, so a rerun into an existing
# training/testing tree still gets any missing setN subdirectories, and
# every failed mkdir is reported with the system error.
for my $dir (@needed_dirs) {
    next if -d $dir;
    mkdir $dir or die "Could not make directory $dir: $!\n";
}

# Output handles for the input file currently being split; element $i
# belongs to cross validation set $i.  Both arrays are reset per input file.
my @trn_sets;
my @tst_sets;

# Output names depend only on an input's base name, so remember which input
# produced each base name in order to warn when a later one overwrites it.
my %input_for_base;

for my $gtf_file (@ARGV) {
    @trn_sets = ();
    @tst_sets = ();

    # Strip the directory and a trailing ".gtf" to get the output base name.
    my $base_name = fileparse($gtf_file, '\.gtf');
    if (defined $input_for_base{$base_name}) {
        warn "Warning: output for $gtf_file overwrites output for "
            . "$input_for_base{$base_name} (same base name)\n";
    }
    $input_for_base{$base_name} = $gtf_file;

    # Open one training and one testing file per set.  The names are kept so
    # that open/close failures can say which file was affected.
    my (@trn_files, @tst_files);
    for (my $i = 0; $i < $num_sets; $i++) {
        if ($opts{flat}) {
            $trn_files[$i] = "$trn_dir/$base_name.training.set$i.gtf";
            $tst_files[$i] = "$tst_dir/$base_name.testing.set$i.gtf";
        }
        else {
            $trn_files[$i] = "$trn_dir/set$i/$base_name.gtf";
            $tst_files[$i] = "$tst_dir/set$i/$base_name.gtf";
        }
        # Three-argument open keeps the file name from being parsed as part
        # of the mode.
        open $trn_sets[$i], '>', $trn_files[$i]
            or die "Could not write to $trn_files[$i]: $!\n";
        open $tst_sets[$i], '>', $tst_files[$i]
            or die "Could not write to $tst_files[$i]: $!\n";
    }

    print STDERR "$gtf_file: Reading ... ";
    # GTF is a project module; new() is called as a plain function with a
    # hash of settings, exactly as before.
    # NOTE(review): no_check presumably skips GTF validation -- confirm.
    my $cur_gtf = GTF::new({gtf_filename => $gtf_file, no_check => 1});
    print STDERR "Computing sets ... ";
    my $genes = $cur_gtf->genes;
    for my $gene (@$genes) {
        # The gene is tested in one randomly chosen set and is part of the
        # training data of every other set.
        my $set = int rand $num_sets;
        $gene->output_gtf($tst_sets[$set]);
        for (my $i = 0; $i < $num_sets; $i++) {
            next if ($i == $set);
            $gene->output_gtf($trn_sets[$i]);
        }
    }

    # Buffered write errors (e.g. a full disk) only surface at close, so
    # check it rather than leaving truncated files behind silently.
    for (my $i = 0; $i < $num_sets; $i++) {
        close $trn_sets[$i]
            or die "Error while closing $trn_files[$i]: $!\n";
        close $tst_sets[$i]
            or die "Error while closing $tst_files[$i]: $!\n";
    }
    print STDERR "\n";
}
