#!/usr/bin/perl -w use strict; # if the number of input arguments is lower than 4 # return a message showing the error if (scalar(@ARGV) < 3) { print "dnaloglkh.pl codontable DNAsequence frame\n"; exit(1); } # Load the arguments into four vars my $filecodontable = $ARGV[0]; my $filesequence = $ARGV[1]; # Length and overlap of the window to scan the sequence my $lengthwindow = $ARGV[2]; my $overlap = $ARGV[3]; # open the first file: table of codon usage frequencies if (!open(PROPCODONS,"< $filecodontable")) { print "dnaloglkh.pl: the file $filecodontable can not be opened\n"; exit(1); } # load the frequencies of codon usage into the hash table %pcodons # this hash is indexed by a triplet of nucleotides or codon my %pcodons; my ($codon,$freq); my $line; # for each line in the file (tableofcodons) do while ($line = ) { # extract the two values or columns with a regular expression # A group of letters and a decimal number ($codon,$freq) = ($line =~ m/(\w+) (\d+\.\d+)/); # load the hash table $pcodons{$codon}=$freq; } close(PROPCODONS); ########################################################## # open the DNA sequence (second file) if (!open(SEQUENCE,"< $filesequence")) { print "dnaloglkh.pl: the file $filesequence can not be opened\n"; exit(1); } # FASTA format: # >header containing a description of the sequence # AGACTGACTTAC # AGACTGACTTAC # AGACTGAC my $iden = ; # reading the header of the sequence my @seql = ; # load the complete sequence into an array my $seq = ""; # use a var to save the whole sequence inside close(SEQUENCE); # concat every segment (line) in the array into the var $seq my $i=0; while ($i < scalar(@seql)) { chomp($seql[$i]); $seq = $seq . $seql[$i]; $i = $i + 1; } $seq = "\U$seq"; # Scan the three possible frames for every window $i = 0; while ($i <= length($seq) - $lengthwindow) { # extract the segment of sequence corresponding to that window # the function substr(s,p,l) extract a subsequence of "s" from # the position "p" having "l" symbols my $seqwindow = substr($seq,$i,$lengthwindow); # compute the log-likelihood ratio for every reading frame my $frame0 = compute_loglkh($seqwindow,0); my $frame1 = compute_loglkh($seqwindow,1); my $frame2 = compute_loglkh($seqwindow,2); # take the maximum value. Display the value and the position my $maximum = max($frame0,$frame1,$frame2); printf "%d %.5f\n",$i,$maximum; # increase $i with the $overlap value $i = $i + $overlap; } #printf "%.2f\n",compute_loglkh($seq,$ARGV[2]); ############################################################## ############################################################## ############# FUNCTIONS #################### ############################################################## ############################################################## # NAME: compute_loglkh # GOAL: compute the log-likelihood of a sequence using # given reading frame # PARAMS: first - DNA sequence # second - reading frame = {0,1,2} # RETURNS: the log-likelihood ratio (coding vs non-coding region) sub compute_loglkh { my $seq = $_[0]; my $frame = $_[1]; $seq = substr($seq,$frame); # we are going to use the array @codons to store all of the codons in # the input DNA sequence. To split the sequence in segments of length # three, a regular expression will be used searching for all of the # possible groups of three symbols (option g) my @codons = ($seq =~ m/.../g); # the likelihood ratio is for a current triplet, the ratio between # the probability computed from the table of codon frequencies divided # by the random probability (uniform), that is, 1/64. After this, # the log must be applied to obtain the log-likelihood ratio. # It is recommended the use of logs to manage calculations involving # small numbers between 0 and 1. Log of the product is equal to the # the sum of logs and the log of the division is the substraction of logs my $r = 0.0; my $i=0; while ($i < scalar(@codons)) { $r = $r + log($pcodons{$codons[$i]}) - log(1/64); $i = $i + 1; } return $r; } # NAME: max # GOAL: compute the maximum value in a list of N values # PARAMS: a list of values # RETURNS: the maximum value in the list sub max { my $i = 1; my $max = $_[0]; while ($i < scalar(@_)) { if ($_[$i] > $max) { $max = $_[$i]; } $i = $i + 1; } return $max; }