#! /usr/bin/perl -w

use XML::LibXML;
use Getopt::Long;
use File::Basename;
use strict;

my $WIDTH = 80;

my $PROGNAME=fileparse($0);
my $usage = "$PROGNAME [options] gHMM-file

Usually, $PROGNAME -d -s -b gHMM-file.

Fixes an old-style gHMM file and converts it to the new style.
Changes include:
  1.  All sequence models are under one element, <sequence_models>,
      rather than <conservation_models>, <est_models>, etc.
  2.  All data attributes in string_models have a key=value pair
      format, rather than the opaque numbers-and-symbols only
      format.
  3.  BNTREE* models are no longer hacks into LUT, WAM and CDS,
      but are instead models unto themselves.  All BNTREE models
      must be moved from the zoe_model field to the model field.
Options:
  -w num  width of lines to print out
  -s      convert sequence_models tag (see 1.)
  -d      convert data attributes (see 2.)
  -b      convert BNTREE zoe_model attributes (see 3.)
";

my ($sequence_fix, $data_fix, $bntree_fix);
GetOptions("width=i"=>\$WIDTH,"data-fix"=>\$data_fix,
  "bntree-fix"=>\$bntree_fix,"sequence-fix"=>\$sequence_fix) or die($usage);
@ARGV == 1 or die($usage);

my $parser = new XML::LibXML;
my $ghmm = $parser->parse_file($ARGV[0]);
my $newghmm = $ghmm->cloneNode(1);
my $root = $newghmm->documentElement();

die "$ARGV[0] is not an gHMM file.\n" 
  unless($root->nodeName eq "gHMM");

if($sequence_fix) {
  my($sm_n, $sm_lc);
  for my $child($root->childNodes) {
    if($child->nodeName eq "sequence_models") {
      $sm_n = $child;
      $sm_n->insertBefore(new XML::LibXML::Comment("\nDNA models\n"),
        ${$sm_n->childNodes}[0]);
    }
    elsif($child->nodeName =~ m/^(\w+)_models/) {
      $sm_n->appendChild(
        new XML::LibXML::Comment("\n".ucfirst($1)." models\n"));
      for my $gc ($child->childNodes) {
        $sm_n->appendChild($gc) if($gc->nodeName =~ m/string_model/);
        $sm_lc = $gc;
      }
      $root->removeChild($child);
    }
  }
}

if($data_fix){ 
  for my $child($root->childNodes()) {
    if($child->nodeName =~ m/models$/) {
      for my $gchild($child->childNodes()) {
        fix_data_attr($gchild);
      }
    }
  }
}

if($bntree_fix) {
  for my $child($root->childNodes()) {
    if($child->nodeName =~ m/models$/) {
      for my $gchild($child->childNodes()) {
        my($zma, $ma);
        for my $a($gchild->attributes()) {
          $zma = $a if(defined($a) && $a->name =~ m/zoe_model/);
        }
        if($zma) {
          my $v = $zma->value();
          if($v =~ m/BNTREE/) {
            $gchild->removeAttribute("zoe_model");
            $gchild->setAttribute("model", $v);
          }
        }
      }
    }
  }
}

my $str = $newghmm->toString(2);
for my $line(split("\n", $str)) {
  next unless($line =~ s/^(\s*)(\S)/$2/);
  my $ind = $1;
  my $sl = $ind;
  while($line =~ s/(^[A-Za-z]+="[^"]+")// || $line =~ s/(^\S+)//) {
    my $w = $1;
    $line =~ s/^\s*(\S)/$1/;
    if(length($sl) + length($w) >= $WIDTH && $sl =~ m/\S/) {
      print("$sl\n");
      $sl = "$ind  $w ";
    }
    else {
      $sl .= "$w ";
    }
  }
  print("$sl\n");
}

#print $str;
#exit;

sub fix_data_attr {
  my ($n) = @_;
  return unless($n->nodeName =~ m/model$/);
  for my $c ($n->childNodes) {
    fix_data_attr($c);
  }
  my ($t,$da,$name);
  for my $a ($n->attributes) {
    $t = $a->value if($a->nodeName eq "model");
    $da = $a if($a->nodeName eq "data");
    $name = $a->value if($a->nodeName eq "name");
  }
  return unless(defined($da));
  { no strict 'refs'; my $meth="fix_$t"; &{$meth}($t, $da, $name); }
}


sub fix_ASM { fix_order(@_) }
sub fix_CDS { fix_order(@_) }
sub fix_ISO { 
 $_[1]->setValue("ISOs=".join(",", split(' ',$_[1]->value))); 
}
sub fix_JUMPLUT { fix_order(@_, "jumpLen") }
sub fix_LUT { fix_order(@_) }
sub fix_MARG_WAM { fix_order(@_, "printedOrder") }
sub fix_SDT { }
sub fix_SIG { }
sub fix_SPLIT { $_[1]->setValue("zoeHeaderEnd=".$_[1]->value) }
sub fix_WAM { fix_order(@_) }
sub fix_WMM { }
sub fix_WWAM { 
  my ($type, $attr, $name) = @_;
  warn("A WWAM model was found with the name $name.\n".
    "These are typically not used.  Consider changing it to a WAM.\n");
  fix_order(@_, "windowRadius", 1) 
}
sub fix_ELUT { fix_order(@_, "footprint") }

sub fix_order {
  my ($type, $attr, $name, $extra, $dontwarn) = @_;
  $dontwarn ||= 0;
  my $v = $attr->value;
  my $str = "";
  if($v =~ m/^(\d+)\/(\d+)/) {
    $str = "order=$1 targetOrder=$2";
  }
  elsif($v =~ m/^(\d+)/) {
    $str = "order=$1";
  }
  else {
    die("Odd data str $v\n");
  }

  if($extra) {
    my $ev = (split(' ',$v))[1];
    if(defined($ev)) {
      $str .= " $extra=$ev";
    } else {
      warn("$type expects a $extra after the first data parameter. Skipping\n")
        unless($dontwarn);
    }
  }
  $attr->setValue($str);
}
