#!/usr/bin/perl -w

use strict;

# if the number of input arguments is lower than 4
# return a message showing the error

if (scalar(@ARGV) < 3) {
  print "dnaloglkh.pl codontable DNAsequence frame\n";
  exit(1);
}

# Load the arguments into four vars 
my $filecodontable = $ARGV[0];
my $filesequence = $ARGV[1];

# Length and overlap of the window to scan the sequence
my $lengthwindow = $ARGV[2];  
my $overlap = $ARGV[3];  


# open the first file: table of codon usage frequencies

if (!open(PROPCODONS,"< $filecodontable")) {
  print "dnaloglkh.pl: the file $filecodontable can not be opened\n";
  exit(1);
}

# load the frequencies of codon usage into the hash table %pcodons
# this hash is indexed by a triplet of nucleotides or codon

my %pcodons;
my ($codon,$freq);
my $line;

# for each line in the file (tableofcodons) do

while ($line = <PROPCODONS>) {

        # extract the two values or columns with a regular expression
        # A group of letters and a decimal number

        ($codon,$freq) = ($line =~ m/(\w+) (\d+\.\d+)/);

        # load the hash table
        $pcodons{$codon}=$freq;
}

close(PROPCODONS);

##########################################################
# open the DNA sequence (second file)

if (!open(SEQUENCE,"< $filesequence")) {
        print "dnaloglkh.pl: the file $filesequence can not be opened\n";
        exit(1);
}

# FASTA format:
# >header containing a description of the sequence
# AGACTGACTTAC
# AGACTGACTTAC
# AGACTGAC

my $iden = <SEQUENCE>;  # reading the header of the sequence
my @seql = <SEQUENCE>;  # load the complete sequence into an array
my $seq  = "";          # use a var to save the whole sequence inside

close(SEQUENCE);

# concat every segment (line) in the array into the var $seq

my $i=0;

while ($i < scalar(@seql)) {
        chomp($seql[$i]);
        $seq = $seq . $seql[$i];
        $i = $i + 1;
}

$seq = "\U$seq";

# Scan the three possible frames for every window

$i = 0;
while ($i <= length($seq) - $lengthwindow) {

  # extract the segment of sequence corresponding to that window
  # the function substr(s,p,l) extract a subsequence of "s" from
  # the position "p" having "l" symbols

  my $seqwindow = substr($seq,$i,$lengthwindow);

  # compute the log-likelihood ratio for every reading frame

  my $frame0 = compute_loglkh($seqwindow,0);
  my $frame1 = compute_loglkh($seqwindow,1);
  my $frame2 = compute_loglkh($seqwindow,2);
  
  # take the maximum value. Display the value and the position

  my $maximum = max($frame0,$frame1,$frame2);

  printf "%d %.5f\n",$i,$maximum;

   # increase $i with the $overlap value

  $i = $i + $overlap;

}


#printf "%.2f\n",compute_loglkh($seq,$ARGV[2]);


##############################################################
##############################################################
#############            FUNCTIONS        ####################
##############################################################
##############################################################

# NAME: compute_loglkh
# GOAL: compute the log-likelihood of a sequence using
#       given reading frame
# PARAMS: first  - DNA sequence
#         second - reading frame =  {0,1,2}
# RETURNS: the log-likelihood ratio (coding vs non-coding region)

sub compute_loglkh {
  my $seq   = $_[0];
  my $frame = $_[1];

  $seq = substr($seq,$frame);

  # we are going to use the array @codons to store all of the codons in
  # the input DNA sequence. To split the sequence in segments of length
  # three, a regular expression will be used searching for all of the
  # possible groups of three symbols (option g)
  
  my @codons = ($seq =~ m/.../g);

  # the likelihood ratio is for a current triplet, the ratio between
  # the probability computed from the table of codon frequencies divided
  # by the random probability (uniform), that is, 1/64. After this,
  # the log must be applied to obtain the log-likelihood ratio.
  # It is recommended the use of logs to manage calculations involving
  # small numbers between 0 and 1. Log of the product is equal to the
  # the sum of logs and the log of the division is the substraction of logs

  my $r = 0.0;
  my $i=0;

  while ($i < scalar(@codons)) {

    $r = $r + log($pcodons{$codons[$i]}) - log(1/64);

    $i = $i + 1;

  }

  return $r;
}

# NAME: max
# GOAL: compute the maximum value in a list of N values
# PARAMS: a list of values
# RETURNS: the maximum value in the list

sub max {

  my $i = 1;
  my $max = $_[0];

  while ($i < scalar(@_)) {

    if ($_[$i] > $max) {
      $max = $_[$i];
    }

    $i = $i + 1;
  }

  return $max;
}

