=head1 NAME

iPE::Util::DNATools - A set of functions to analyze DNA sequences.

=head1 DESCRIPTION

A set of functions that extract information from, or perform transformations on, DNA sequences.

=head1 FUNCTIONS

=over 8

=cut

package iPE::Util::DNATools;
use iPE;
use base ("Exporter", "DynaLoader");

our @EXPORT = qw(reverseComplement gcContent levels 
                 rcCoords getOpposingFrame );

use strict;

=item reverseComplement (stringRef)

Return the reverse complement of a DNA sequence including upper- and lower-case ambiguity codes.  Only references are passed in and out to prevent overflow of memory for large sequences.

Note that S (C or G) and W (A or T) are left unchanged, as is N, because each of these codes is its own complement.

=cut

sub reverseComplement {
    # Build the reverse complement of the DNA string referenced by the first
    # argument and hand back a reference to the new string.  The caller's
    # string is left untouched.
    my ($seqRef) = @_;

    # Reverse first so that only one working copy of the sequence exists;
    # the complementing below is then done in place on that copy.
    my $revComp = scalar reverse ${$seqRef};

    # IUPAC codes in both cases.  S, W and N map onto themselves.
    $revComp =~ tr/ACGTRYMKSWBDHVNacgtrymkswbdhvn/TGCAYRKMSWVHDBNtgcayrkmswvhdbn/;

    return \$revComp;
}

=item rcCoords (high, low, length)

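Returns an array containing the low and high coordinates (in that order) of a region as they would be on the reverse complement sequence, using 0-based indexing.  The two coordinates may be passed in either order; the sequence length must be the last argument.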

=cut
sub rcCoords {
    # Map a region's 0-based coordinates onto the reverse complement strand.
    # Arguments: the two coordinates (in either order) followed by the
    # sequence length.  Returns the mirrored coordinates, low one first.
    my @coords    = @_;
    my $seqLength = pop @coords;

    # Largest coordinate first: after mirroring it becomes the smallest.
    @coords = sort { $b <=> $a } @coords;

    # Position p on the forward strand is position (length - p - 1) on the
    # reverse strand.
    @coords[0, 1] = map { $seqLength - $_ - 1 } @coords[0, 1];

    return @coords;
}

=item getOpposingFrame(start, end, frame)

Infers the frame at the other end of the feature given its coordinates and opposing frame.  Specifically, given the number of overhanging bases on one end of the (CDS) feature, get the number of overhanging bases on the other end of the feature.

=cut
# from sam:
      #on the plus strand a state's frame is equal to the number
      #of overhanging bases at the END of the CURRENT state
       #--
      #on the minus strand a state's frame is equal to the number of
      #overhanging bases at the BEGINNING of the NEXT state
sub getOpposingFrame {
    # Given a feature's inclusive start and end coordinates and the number of
    # overhanging bases at one end, return the number of overhanging bases at
    # the other end.
    my ($start, $end, $endFrame) = @_;

    # Coordinates are inclusive, hence the +1.
    my $featureLength = $end - $start + 1;

    # With a positive right operand Perl's % never yields a negative value,
    # so the result is always 0, 1 or 2.
    return ($featureLength - $endFrame) % 3;
}

#=item getZoeFrame(start, end, strand, endFrame)

#This function finds the frame as used by the Zoe codebase.  Frame is defined as being the first 3' overhang starting at the end of the current features.  Specifically, if the feature is on the plus strand, it is the number of bases overhanging at the highest coordinate of the feature.  If the feature is on the minus strand, it is the number of bases overhanging the lowest coordinate of the next feature.
#
#You must pass the strand of the feature, and end frame (the number of bases overhanging the end of the feature).
#
#=cut
#sub getZoeFrame {
    #my ($start, $end, $strand, $endFrame) = @_;
    #
    #if($strand eq '+')  { return $endFrame          }
    #else                { return (3-$endFrame)%3    }
#}

=item gcContent

Return the GC content of a sequence, which may include ambiguity codes.  The value is the sum of the C and G entries returned by C<levels>.

=cut

sub gcContent {
    # GC content of a sequence: the C and G entries reported by levels(),
    # added together.
    my ($seq) = @_;

    my $composition = levels($seq);
    my ($cLevel, $gLevel) = @{$composition}{qw(C G)};

    return $cLevel + $gLevel;
}

=item levels (sequence)

Return a hash reference to the levels of each of the canonical DNA codes, A, C, G, and T.  The sequence can include ambiguity codes and the levels are appropriately measured.

=cut

sub levels {
    # Compute the level of each canonical base (A, C, G, T) in a sequence.
    #
    # Takes the sequence as a plain string and returns a reference to the hash
    # that _levels() fills in.  _levels() is not defined in this file; it is
    # handed the result hash, the sequence and the sequence length.
    my ($seq) = @_;
    my %levelOf;

    _levels(\%levelOf, $seq, length $seq);

    # The counting used to be done here in pure Perl, which proved too slow.
    # That retired version weighted each code as below and then divided every
    # total by the sequence length; _levels() presumably keeps the same
    # weighting -- confirm against its implementation.
    #
    #   A C G T   1 to that base
    #   N         1/4 each to A, C, G, T
    #   R  A,G    Y  C,T    M  A,C    K  G,T    W  A,T    S  C,G   (1/2 each)
    #   B  C,G,T  D  A,G,T  H  A,C,T  V  A,C,G                     (1/3 each)
    return \%levelOf;
}

=item fast_uc (stringref, length) 

This function is a fast, memory efficient version of the uc routine.

=back

=head1 SEE ALSO

L<iPE>

=head1 AUTHOR

Bob Zimmermann (rpz@cse.wustl.edu)

=cut

1;
