#!@WHICHPERL@ -w

# AUTHOR: James Johnson
# CREATE DATE: 18/9/2014
# DESCRIPTION: Convert regular expression motifs in ELM format to MEME format

use warnings;
use strict;


use lib qw(@PERLLIBDIR@);

use Alphabet qw(protein);
use MotifUtils qw(seq_to_intern intern_to_meme read_background_file);

use Fcntl qw(O_RDONLY);
use Getopt::Long;
use Pod::Usage;

=head1 NAME

elm2meme - Converts ELM motifs into MEME motifs.

=head1 SYNOPSIS

elm2meme [options] <ELM file>+

 Options: 
  -anchored                     allow anchored motifs;
                                default: skip anchored motifs
  -bg <background file>         file with background frequencies of letters; 
                                default: uniform background
  -pseudo <total pseudocounts>  add <total pseudocounts> times letter 
                                background to each frequency; default: 0
  -logodds                      print log-odds matrix, too; 
                                default: print frequency matrix only
  -url <website>                website for the motif; the motif's accession
                                is substituted for MOTIF_NAME
  -h                            print usage message

 Converts ELM motifs into MEME motifs

 Writes to standard output.

=cut

# Command-line state, initialised to the defaults described in the usage text.
my $help          = 0;
my $anchored      = 0;
my $bg_file;
my $pseudo_total  = 0;
my $print_logodds = 0;
my $url_pattern   = "";
my @files         = ();

# One switch per line: option specification => variable that receives it.
my @option_spec = (
  "anchored" => \$anchored,
  "bg=s"     => \$bg_file,
  "pseudo=f" => \$pseudo_total,
  "logodds"  => \$print_logodds,
  "url=s"    => \$url_pattern,
  "help|?"   => \$help,
);

# An unparseable command line prints the usage and exits with status 2.
if (!GetOptions(@option_spec)) {
  pod2usage(2);
}

# Explicit request for help.
if ($help) {
  pod2usage(1);
}

# Whatever GetOptions left behind is the list of ELM files; at least one
# is required.
@files = @ARGV;
if (!@files) {
  pod2usage("Missing ELM files");
}

# Get the background model: a hash of letter frequencies for the protein
# alphabet, later handed to seq_to_intern for every motif.
# $bg_file is undef unless -bg was given; the usage text says the default is
# then a uniform background.
# NOTE(review): the uniform fallback lives in MotifUtils - confirm there.
my %bg = &read_background_file(&protein(), $bg_file);

# Convert every ELM record in every input file. MEME motifs are written to
# standard output; per-record diagnostics and a final summary go to standard
# error.
#
# Counters reported in the summary line:
#   $num_skipped - records whose regex cannot become a fixed-width motif
#   $num_failed  - records that were malformed or that seq_to_intern rejected
#   $num_motifs  - motifs actually written
my $num_skipped = 0;
my $num_failed = 0;
my $num_motifs = 0;
foreach my $file (@files) {
  # An unreadable input is fatal: carrying on would silently produce an
  # incomplete motif file while still exiting with status 0.
  sysopen(my $fh, $file, O_RDONLY)
      or die "Unable to open ELM file \"$file\": $!\n";
  while (my $line = <$fh>) {
    next if $line =~ m/^#/; #skip comments
    next if $line =~ m/^\"Accession\"/; #skip field names
    # drop the line terminator (LF or CRLF) so it can neither hide the closing
    # quote of the last field nor leak into a field value
    $line =~ s/\r?\n\z//;
    next if $line =~ m/^\s*$/; #skip blank lines
    # fields are double-quoted and tab separated; splitting on the "\t"
    # sequence leaves only the outermost pair of quotes to strip
    my @parts = split /"\t"/, $line;
    if (@parts) {
      $parts[0] =~ s/^"//;
      $parts[-1] =~ s/"$//;
    }
    # Columns: 0 Accession, 1 ELMIdentifier, 2 FunctionalSiteName,
    #          3 Description, 4 Regex, 5 Probability, 6 #Instances,
    #          7 #Instances_in_PDB
    # Only the four below are needed for the conversion.
    my ($acc, $elmid, $regex, $nsites) = @parts[0, 1, 4, 6];

    # a record too short to contain a regex can not be converted
    unless (defined $regex) {
      $num_failed++;
      print STDERR "Malformed ELM record in $file at line $.: no regex field\n";
      next;
    }

    # handle motifs that must be anchored?
    # /g is required so that a motif anchored at both ends loses both the
    # leading ^ and the trailing $ (otherwise the remaining anchor would make
    # the check below skip it)
    if ($anchored) { $regex =~ s/^\^|\$$//g; }
    # can not handle motifs that are of variable length
    # (a ^ directly after [ is a negated character class, not an anchor)
    if ($regex =~ m/(^|[^\[])\^/ || $regex =~ m/[\$\{\}\?\|\*\+]/) {
      $num_skipped++;
      printf STDERR "Skipping %15s %25s     %s\n", $acc, $elmid, $regex;
      next;
    }
    # remove unneeded brackets -
    # as we've already checked for length quantifiers, ORs and anchors these
    # can't really be doing anything
    $regex =~ s/[\(\)]//g;

    # a missing, non-numeric or non-positive instance count becomes 1
    $nsites = 1
        unless defined $nsites && $nsites =~ m/^\d+(?:\.\d+)?$/ && $nsites > 0;

    # the accession doubles as the motif identifier in the URL
    my $url = $url_pattern;
    $url =~ s/MOTIF_NAME/$acc/g;

    my ($motif, $errors) = seq_to_intern(\%bg, $regex, $nsites,
        $pseudo_total, url => $url, id => $acc, alt => $elmid, also => '.');

    print STDERR join("\n", @{$errors}), "\n" if @{$errors};
    if ($motif) {
      # the last argument is true only for the first motif written
      # (presumably so the MEME header is emitted once - confirm in MotifUtils)
      print intern_to_meme($motif, $print_logodds, 1, !($num_motifs++));
    } else {
      $num_failed++;
    }
  }
  close($fh);
}
print STDERR "Skipped: $num_skipped, Failed: $num_failed, Converted: $num_motifs\n";
