#!/usr/bin/perl -w

use Getopt::Long;
use iPE;
use iPE::Globals;
use iPE::Instance;
use iPE::gHMM;
use strict;

# model type
# This is hard-coded because I don't think many users will want to change this. 
# If someone knows enough to want to change the model type, 
# they will know enough to look into the code. 
my $MODEL_TYPE = "R1";
my $TRAIN_COMMAND = "/bio/home/bl/rhb/bin/bntree/train ";
my $TRAIN_JOBFILE   = "train_nscan";
my $NOTRAIN_JOBFILE = "sew_params";

my $usage = "$0 [-r] [-n] [-c <train program>] [-m <model type>]
                <instance>         the instance file used to estimate parameters
                <phylo tree>
                <new zhmm name>    name of the new complete zhmm file
                <target genome>
                <list of informant genomes in order>

Options:
 -d <outdir>   the source of the .ss files (if different than in instance file)
 -z <zhmmfile> the path and name of original zhmm file (if differs from inst)
 -r            execute training remotely on the queue
 -n            do not train, just combine phylogenetic parameters into one file
 -c <pgm>      use an alternate training program (default: $TRAIN_COMMAND)
 -m <model>    use an alternate model type for nscan (default: $MODEL_TYPE)

Example of phylogenetic tree

	\"[[galGal2:4,[mm5:2,rn3:3]],none]\"

Example of tree root

	\"hg17\"

In this example the tree is re-rooted at the target which, in this case, 
is human. The target is not listed explicitly in the tree; it is listed 
separately. The informants are chicken (galGal2), mouse (mm5), and rat (rn3). 
The numbers following the species names and colon refer to the order in which 
the each species occurs in the alignment file. For example, mouse - second, 
rat - third, and chicken - fourth.\n";

my $options = new iPE::Options;
my $g = new iPE::Globals;
$g->options($options);
$options->verbose(0);
$options->supressWarnings(1);
$options->debugOutput(0);
$g->init_fhs();

my ($remoteflag, $train, $notrainflag, $model_type, 
    $output_dir, $old_zhmm_file);
GetOptions("dir=s" => \$output_dir,
           "zhmm=s"   => \$old_zhmm_file,
           "remote" => \$remoteflag, 
           "command=s" => \$train, 
           "notrain" => \$notrainflag,
           "model=s"   => \$model_type);
$model_type = $MODEL_TYPE unless(defined($model_type) && length($model_type));
$train = $TRAIN_COMMAND   unless(defined ($train) && length($train));
die $usage unless @ARGV >= 5; 

# parse the command line parameters
my $n = 0;
my $instance_file   = $ARGV[$n++];
my $phylo_tree      = $ARGV[$n++];
my $nscan_name      = $ARGV[$n++];
my $root            = $ARGV[$n++];

check_phylo_tree_format($phylo_tree);

my @informant_list;
while($n < scalar(@ARGV)) {
    push @informant_list, $ARGV[$n++];
}

my $instance = new iPE::Instance($instance_file);
$instance->parse_file($instance_file, "iPE_instance");
$instance->options->verbose(0);
$instance->options->debugOutputFile(undef);

unless(defined($output_dir)) {
    $output_dir = $instance->options->outputBaseDir;
}
unless(defined($old_zhmm_file)) {
    $old_zhmm_file = $instance->options->outputBaseDir."/".
        $instance->options->zoeOutputFile;
}

my $gHMM = new iPE::gHMM($instance->gHMMFile);
$gHMM->parse_file($instance->gHMMFile, "gHMM");
my $emis = $gHMM->malignEmissions;
$instance = undef;
$gHMM = undef;

my $num_informants = scalar(@informant_list);

# The bulk of the code:
# First, we go through running sam's train program.  If we are running this
# locally, we will go straight through sewing the new parameter file together.
# Otherwise, we enqueue this same command line as a queue job, with the -n flag
# to skip the training step.
unless ($notrainflag) {
    print STDERR "[ Training ... ";
    if(!train_nscan($remoteflag, $train, $phylo_tree, $model_type, 
                    $output_dir, $num_informants)) {
        exit 1; 
    }
    print STDERR "done ]\n";
}
if($remoteflag) {
    print STDERR "[ Enqueuing the parameter file sewing step ... ";
    enqueue_notrain_job($train, $model_type, $output_dir, @ARGV);
    print STDERR "done ]\n";
}
else {
    print STDERR "[ Incorporating phylogenetic parameters ... ";
    my $phylo = get_phylo_section($instance_file, $output_dir);
    inject_phylo($old_zhmm_file, $phylo, $output_dir, $nscan_name);
    print STDERR "done ]\n";
}


# Find all the ss files in the directory and convert them to bntree files
# using sam's train utility included in the zoe code base.
sub train_nscan {
    my ($remoteflag, $train, $phylo_tree, $model_type, $output_dir, $n) = @_;

    opendir(SS_DIR, $output_dir) 
        or die "Could not open directory $output_dir\n";
    my @ss_files = grep(/\.ss$/, readdir(SS_DIR));
    closedir (SS_DIR);

    die "No ss files found in $output_dir.\n" if (scalar(@ss_files) == 0);

    my ($command);
    my $script = "$output_dir/$TRAIN_JOBFILE";
    if($remoteflag) {
        open (SCR, ">$script")
            or die "Could not open file $output_dir/train_nscan for writing\n";
    }

    for my $ss_file (@ss_files) {
        my ($model) = $ss_file =~ m/(\S+)\.ss/;
        $command = "$train $phylo_tree $model_type $output_dir/$ss_file ".
            "$model $output_dir/$model.bntree";
        print SCR "$command\n"          if( $remoteflag);
        if(!$remoteflag) {
            my $err = system("$command 2> /dev/null");
            if($err) {
                print STDERR "\nError encountered, rerunning command $command\n";
                system($command);
                return 0;
            }
        }
    }
    
    if($remoteflag) {
        close SCR;
        my $groups = 1;
        # if we only have one informant, it's silly to put this in several
        # jobs since the training takes less than a minute.
        $groups = scalar(@ss_files) if($n == 1);
        $command = "nq -m 4096 -g $groups $script > /dev/null 2>&1\n";
        system($command);
    }
    return 1;
}

sub get_phylo_section {
    my ($instance_file, $param_dir) = @_;

    my $ps = "<PHYLOGENETIC_MODELS>\n\n";

    my ($i, $file, $model_name);
    for my $model (@$emis) {
        $model_type = $model->zModel;
        $model_name = $model->name;

        die "\nThis script expects the gHMM file associated with the \n".
            "instance file to zoe_model attributes set in all the \n".
            "phylogenetic_modelsto the BNTREE name (BNTREE, BNTREE_ARRAY, \n".
            "BNTREE_CDS) that the model is.\n"
            if(!defined $model_type);

        if($model_type eq "BNTREE_ARRAY") {
            $ps .= "$model_name\t$model_type\t".$model->targetOrder."\t".
                $model->zLength."\t".$model->zFocus."\n";
            for($i = 0; $i < $model->zLength; $i++) {
                $file = "$param_dir/".$model_name."_$i.bntree";
                open BT, "$file" or die "Could not open file $file to read.\n";
                while(<BT>) { $ps .= $_ }
                close BT;
            }
        }
        elsif($model_type eq "BNTREE_CDS") {
            $ps .= "$model_name\t$model_type\t".$model->zLength."\n";
            for($i = 0; $i < 3; $i++) {
                $file = "$param_dir/".$model_name."_$i.bntree";
                open BT, "$file" or die "Could not open file $file to read.\n";
                while(<BT>) { $ps .= $_ }
                close BT;
            }
        }
        elsif($model_type eq "BNTREE") {
            $file = "$param_dir/$model_name.bntree";
            open BT, "$file" or die "Could not open file $file to read.\n";
            while(<BT>) { $ps .= $_ }
            close BT;
        }
        else {
            die "Unknown model type $model_type for $model_name.\n";
        }
        $ps .= "\n";
    }

    return $ps;
}

sub inject_phylo {
    my ($old_param_file, $phylo_section, $output_dir, $nscan_name) = @_;
    open OLD, "$old_param_file" 
        or die "Could not open $old_param_file to read.\n";
    open NEW, ">$output_dir/$nscan_name"
        or die "Could not open $output_dir/$nscan_name to write.\n";
    while(<OLD>) {
        if(m/<GTF_CONVERSION>/) {
            print NEW $phylo_section;
        }
        print NEW $_;
    }
    close NEW;
    close OLD;
}

sub enqueue_notrain_job {
    my($train, $model_type, $output_dir, @args) = @_;
    my $script = "$output_dir/$NOTRAIN_JOBFILE";
    open SCR, ">$script" or die "Could not open $script for writing.\n";
    print SCR "$0 -l -n -c $train -m $model_type ";
    print SCR "$_ " for(@args);
    print SCR "\n";
    close SCR;
    system("enqueue $script -mem 4096 --args -hold_jid $TRAIN_JOBFILE ".
        "> /dev/null 2>&1");
}

sub check_phylo_tree_format {
    my ($tree) = @_;

    my $errmsg = "$usage\n***Error in formatting of phylo tree.\n".
        "You entered \"$tree\".\nThe tree should have no spaces in it.\n";

    $tree =~ s/^[\s]*//;
    $tree =~ s/[\s]*$//;

    die($errmsg) if($tree =~ m/\s/);
    if($tree !~ m/^\[/ || $tree !~ m/\]$/) {
        $errmsg .= "It should begin with a [ and end with a ]\n";
        die $errmsg;
    }

    my $tree_formation_error = descend_phylo_tree(\$tree);
    die("$errmsg$tree_formation_error\n") if(length($tree_formation_error));

} 

sub descend_phylo_tree {
    my $tree = shift;

    $$tree =~ s/^\[//;

    my $msg;
    $msg = check_node($tree);
    return $msg if(length($msg));
    if($$tree !~ m/^,(.*)$/) {
        return "Comma appears to be missing before $$tree\n";
    }
    $$tree = $1;
    $msg = check_node($tree);
    return $msg if(length($msg));
    if($$tree !~ m/^\]/) {
        return "Tree appears unbalanced after $$tree.\n";
    }

    $$tree =~ s/^\]//;

    return "";
}

sub check_node {
    my $tree = shift;

    if($$tree =~ m/^\[/) {
        $$tree =~ s/^\[//;
        my $msg = descend_phylo_tree($tree);
        return $msg if(length($msg));
    }
    else {
        unless($$tree =~ m/^[^:]+:\d+(.*)$/ || $$tree =~ m/^none(.*)$/) {
            return "Bad node format: $$tree\n".
                "Should be species_name:alignment_pos, e.g. mm5:2\n";
        }
        $$tree = $1;
    }
    return "";
}
