#!@WHICHPERL@ -w
# FILE: rna2meme
# AUTHOR: James Johnson
# CREATE DATE: 19/8/2010
# DESCRIPTION: Convert a RNA sequence to its binding motif in MEME format

use warnings;
use strict;

use lib qw(@PERLLIBDIR@);

use Alphabet qw(dna rna);
use MotifUtils qw(matrix_to_intern intern_to_meme read_background_file);

use Fcntl qw(O_RDONLY);
use Getopt::Long;
use List::Util qw(max min);
use Pod::Usage;

=head1 NAME

rna2meme - takes a FASTA file with micro-RNA sequences and creates 
the complementary target motif for each FASTA sequence. 

=head1 SYNOPSIS

rna2meme [options] <filename or '-'>

 Options:
  -rna                          Output RNA motifs (default).
  -dna                          Output DNA motifs instead of RNA motifs.
  -seed_start <offset>          starting offset of seed in RNA sequence,
                                set to 0 to treat entire sequence as seed;
                                default: 0
  -seed_end <offset>            ending offset of seed in RNA sequence;
                                default: 0 
  -start <offset>               starting offset in RNA sequence (inclusive);
                                use negative numbers to count from end;
                                default: 1
  -end <offset>                 ending offset in RNA sequence (inclusive);
                                use negative numbers to count from end;
                                default: -1
  -match <count>                count to assign to a match (complement)
                                in the seed region
                                default: 1
  -wobble <count>               count to assign to a U for a G, or a G for a U
                                in the seed region
                                default: 0.1
  -miss <count>                 count to assign to a non-match non-wobble
                                in the seed region
                                default: 0.01
  -other_count <count>          extra count added to match, wobble 
                                and misses in non-seed positions to reduce
                                their contribution to the score;
                                default: 0.5
  -bg <background file>         file with background frequencies of letters; 
                                default: uniform background
  -pseudo <total pseudocounts>  add <total pseudocounts> times letter 
                                background to each frequency; default: 0
  -logodds                      output the log-odds (PSSM) and frequency 
                                (PSPM) motifs; default: PSPM motif only
  -url <website>                website for the motif; The FASTA ID is 
                                substituted for MOTIF_NAME; The first word
                                after the FASTA ID is substituted for
                                MOTIF_AC; default: no url

  Convert each micro-RNA sequence to its target motif in MEME format.

  Writes standard output.

=cut

# Constants
my $sites = 20;

# Set option defaults
my $is_rna = 1;
my $is_dna = 0;
my $start = 1;
my $end = -1;
my $seed_start = 0;
my $seed_end = 0; 
my $match = 1;
my $wobble = 0.1;
my $miss = 0.01;
my $other_count = 0.5;
my $bg_file;
my $pseudo_total = 0;
my $print_logodds = 0;
my $url_pattern = "";
my $fasta_file;

GetOptions(
  "rna" => \$is_rna,
  "dna" => \$is_dna,
  "seed_start=i" => \$seed_start, "seed_end=i" => \$seed_end,
  "other_count=f" => \$other_count,
  "start=i" => \$start, "end=i" => \$end,
  "match=f" => \$match, 
  "wobble=f" => \$wobble, "miss=f" => \$miss, "bg=s" => \$bg_file, 
  "pseudo=f" => \$pseudo_total, "logodds" => \$print_logodds, 
  "url=s" => \$url_pattern) or pod2usage(2);
pod2usage("Option -seed_end may not be smaller than -seed_start.") if ($seed_end < $seed_start);
pod2usage("Option -other_count must have a non-negative value.") if ($other_count < 0);
pod2usage("Option -start must not be zero.") if ($start == 0);
pod2usage("Option -end must not be zero.") if ($end == 0);
pod2usage("Option -match must have a non-negative value.") if ($match < 0);
pod2usage("Option -wobble must have a non-negative value.") if ($wobble < 0);
pod2usage("Option -miss must have a non-negative value.") if ($miss < 0);
pod2usage("Option -pseudo must have a non-negative value.") if ($pseudo_total < 0);
pod2usage("A FASTA file must be specified.") unless @ARGV;
$fasta_file = shift(@ARGV);
pod2usage("Only one FASTA file may be specified.") if @ARGV;

# read the background file
$is_rna = $is_dna ? 0 : 1;
my %bg = &read_background_file($is_rna ? &rna() : &dna(), $bg_file);

my $fasta_fp;
if ($fasta_file eq "-") {
  $fasta_fp = *STDIN;
} else { 
  sysopen($fasta_fp, $fasta_file, O_RDONLY);
}

my $line;
my $id = '';
my $acc = '';
my @rest;
my $seq = '';
my $num_motifs = 0;
while ($line = <$fasta_fp>) {
  chomp($line);
  if ($line =~ m/^>(.*)$/) {
    $acc = ${seq} unless (defined $acc);
    process_seq() if $id;
    #$id = $1;
    $_ = $1;
    ($id, $acc, @rest) = split;
    $seq = '';
  } else {
    $seq .= $line;
  }
}
$acc = ${seq} unless (defined $acc);
process_seq() if $id;

close($fasta_fp) unless ($fasta_file eq "-");

# process the RNA sequence
sub process_seq {
  die("Missing sequence\n") unless $seq;
  die("Sequence contains invalid characters\n") unless $seq =~ m/^[ACGTU]+$/;
  my $len = length($seq);
  my $s_index = position_to_index($start, $len);
  my $e_index = position_to_index($end, $len);
  if ($s_index > $e_index) {
    my $temp = $s_index;
    $s_index = $e_index;
    $e_index = $temp;
  }
  my $sub_seq = substr($seq, $s_index, $e_index - $s_index + 1);
  my @letters = reverse(split(//, uc($sub_seq)));
  my $matrix = '';
  $len = @letters;
  my $i_rev = $len;
  foreach my $letter (@letters) {
    my ($match1, $wobble1, $miss1) = ($match, $wobble, $miss);
    unless ($seed_start == 0 || ($i_rev >= $seed_start && $i_rev <= $seed_end)) {
      my $scale = 4 * $other_count;
      $match1 = ($match + $other_count) / $scale;
      $wobble1 = ($wobble + $other_count) / $scale;
      $miss1 = ($miss + $other_count) / $scale;
    }
    if ($letter eq 'A') {
      $matrix .= "$miss1 $miss1 $miss1 $match1\n";
    } elsif ($letter eq 'C') {
      $matrix .= "$miss1 $miss1 $match1 $miss1\n";
    } elsif ($letter eq 'G') {
      $matrix .= "$miss1 $match1 $miss1 $wobble1\n";
    } elsif ($letter eq 'T' || $letter eq 'U') {
      $matrix .= "$match1 $miss1 $wobble1 $miss1\n";
    }
    $i_rev--;
  }
  my $url = $url_pattern;
  $url =~ s/MOTIF_NAME/$id/g;
  $url =~ s/MOTIF_AC/$acc/g;
  my ($motif, $errors) = matrix_to_intern(\%bg, $matrix, 'row', $sites, $pseudo_total, id => "$id $acc", url => $url, strands => 1);
  print(STDERR join("\n", @{$errors}), "\n") if @{$errors};
  print intern_to_meme($motif, $print_logodds, 1, !($num_motifs++)) if $motif;
}

# converts a relative position to an actual index
sub position_to_index {
  my ($pos, $len) = @_;
  if ($pos < 0) {
    return max(0, $len + $pos);
  } else {
    return min($len, $pos) - 1;
  }
}
