#!@WHICHPERL@ -w
#
# FILE: beeml2meme
# AUTHOR: Timothy L. Bailey
# CREATE DATE: Oct-23-2011
# DESCRIPTION: Convert a file containing list of TFBS motif matrices from BEEML
# to MEME output format.  BEEML format has the BEEML ID followed by the UNIPROBE ID.

use warnings;
use strict;

use lib qw(@PERLLIBDIR@);

use Alphabet qw(dna);
use MotifUtils qw(matrix_to_intern intern_to_meme read_background_file);

use Fcntl qw(O_RDONLY);
use Getopt::Long;
use Pod::Usage;

=head1 SYNOPSIS

beeml2meme [options] <matrix file>+

 Options:
  -bg <background file>         file with background frequencies of letters
                                default: uniform background
  -pseudo <total pseudocounts>  add <total pseudocounts> times letter
                                background to each frequency; default: 0
  -sg <uniprobe screen grab>     specifify a file containing the contents of: 
                                  http://the_brain.bwh.harvard.edu/uniprobe/browse.php
                                  and use the uniprobe ID as the alternate name
  -logodds                      print log-odds matrix, too;
                                default: print frequency matrix only
  -url <website>                website for the motif; The UNIPROBE ID is
                                substituted for MOTIF_NAME; default: no url
=cut

# Set option defaults
my $bg_file;
my $uniprobe_scrape;
my $pseudo_total = 0;
my $print_logodds = 0;
my $url_pattern = "";
my @matrix_files;

GetOptions("bg=s" => \$bg_file, "pseudo=f" => \$pseudo_total,
  "logodds" => \$print_logodds, "sg=s" => \$uniprobe_scrape,
  "url=s" => \$url_pattern) or pod2usage(2);
#check matrix file
pod2usage("A matrix file must be specified for the conversion.") unless @ARGV;
(@matrix_files) = sort(@ARGV);
#check pseudo total
pod2usage("Option -pseudo must have a positive value.") if ($pseudo_total < 0);

# read the background file
my %bg = &read_background_file(&dna(), $bg_file);

# load UNIPROBE IDs from scrape of http://the_brain.bwh.harvard.edu/uniprobe/browse.php
my %id_index;
if (defined($uniprobe_scrape)) {
  %id_index = ();
  my $sg;
  sysopen($sg, $uniprobe_scrape, O_RDONLY) || die ("Can't open $uniprobe_scrape.\n");
  while (<$sg>) {
    chomp;
    next if (/^#/ || /^\s*$/); # skip comment, blank lines
    my @words = split (/\t/);
    my $protein = $words[0];
    $protein =~ s/\s+$//;
    my $uniprobe_id = $words[1];
    $uniprobe_id =~ s/\s+$//;
    my $article = $words[5];
    # translate to BEEML names
    if ($article eq "Berger et al., Nat Biotech 2006 ") {
      $article = "nbt06";
    } elsif ($article eq "Berger et al., Cell 2008 ") {
      $article = "cell08";
    } elsif ($article eq "De Silva et al., PNAS 2008 ") {
      $article = "pnas08";
    } elsif ($article eq "Pompeani et al., Mol Microbiol 2008 ") {
      $article = "mmb08";
    } elsif ($article eq "Badis et al., Science 2009 ") {
      $article = "sci09";
    } elsif ($article eq "Grove et al., Cell 2009 ") {
      $article = "cell09";
    } elsif ($article eq "Zhu et al., Genome Res 2009 ") {
      $article = "gr09";
    } elsif ($article eq "Lesch et al., Genes Dev 2009 ") {
      $article = "gd09";
    } elsif ($article eq "Scharer et al., Cancer Res 2009 ") {
      $article = "cr09";
    } elsif ($article eq "Wei et al., EMBO J 2010 ") {
      $article = "embo10";
    } else {
      die("Unknown article: $article\n");
    }
    $id_index{$article . $protein} = $uniprobe_id;
  }
  close($sg);
}

# Open the matrix file for reading.
my $count = 0;
my $num_bad = 0;
foreach my $matrix_file (@matrix_files) {
  my $matrix_fp;
  sysopen($matrix_fp, $matrix_file, O_RDONLY) || die("Can't open $matrix_file.\n");

  # Read the input file.
  my $line_number = 0;
  my %motifs;
  my $line;
  while ($line = <$matrix_fp>) {
    $line_number++;
    #check the line is an id line
    my $id;
    my $alt;
    my $uniprobe;
    if ($line =~ m/^# ([^,\s]+)(?:\s+([^,\s]+))?/) {
      $id = $1;
      $alt = $2;
      if (! defined $alt) {
        if (%id_index) {
          $id =~ /([^_]+)_([^_]+)(_\w*)?/;
          my $article = $1;
          my $protein1 = $2;
          if ($article =~ /\.v\d+$/) { # remove end
            $article = substr($article, 0, $-[0]);
          }
          $protein1 =~ /^([^-]+)/; # in case there is a stupid "-" in the name
          my $protein2 = $1;
          my $uniprobe_id = $id_index{$article . $protein1};
          $uniprobe_id = $id_index{$article . $protein2} unless defined $uniprobe_id;
          if (defined($uniprobe_id)) {
            $alt = $uniprobe_id;
            $uniprobe = $uniprobe_id;
            $uniprobe =~ s/^UP//;
          } else {
            printf STDERR "Can't find UniPROBE ID for article $article and protein $protein1\n";
          }
        }
      } else {
        $uniprobe = $alt;
        $uniprobe =~ s/^UP//;
      }
    } else {
      die ("No matrix id found at line $line_number in file $matrix_file.\n");
    }
    # Read the motif energy terms (RT units). One line for each nucleotide,
    # and convert to relative probabilities using exp(-x).
    my %rows = ();
    for(my $base_index = 0; $base_index < 4; $base_index++) {
      $line = <$matrix_fp>;
      $line_number++;
      die ("Missing motif counts at line $line_number in file $matrix_file.\n") unless (defined $line);
      chomp($line);
      if ($line =~ m/^([ACGT])(.*)$/i) {
        my $base = uc($1);
        $_ = $2;
        my $row = join(" ", map {2*exp(-$_)} split);      # convert energy to relative probability
        $rows{$base} = $row;
      } else {
        die("Couldn't find base name at start of line.");
      }
    }
    # make a matrix I can convert into a motif
    my $matrix = $rows{A} . "\n" . $rows{C} . "\n" . $rows{G} . "\n" . $rows{T};
    # make a url
    my $url;
    if ($uniprobe) {
      $url = $url_pattern;
      $url =~ s/MOTIF_NAME/$uniprobe/g;
    }
    # set the names
    my $motif_name = $id;
    # try to convert the motif
    my $nsites = 20;
    my ($motif, $errors) = matrix_to_intern(\%bg, $matrix, 'col', $nsites, 
      $pseudo_total, id => $motif_name, alt => $alt, url => $url, rescale => 1);
    # display errors
    print(STDERR join("\n", @{$errors}), "\n") if @{$errors};
    # output motif
    if ($motif) {
      print STDOUT intern_to_meme($motif, $print_logodds, 1, !($count++));
    } else {
      $num_bad++;
    }
  }
  close($matrix_fp);
}

print(STDERR "Converted $count motifs.\n");
print(STDERR "$num_bad conversion errors.\n");
