#!@WHICHPERL@ -w
#
# FILE: nmica2meme
# AUTHOR: Timothy L. Bailey
# CREATE DATE: 16/10/2010
# DESCRIPTION: Convert a file containing list of TFBS motif matrices from 
# nestedMICA (BioTiffin/XMS) format to MEME output format. 

use warnings;
use strict;

use lib qw(@PERLLIBDIR@);

use Alphabet qw(dna);
use MotifUtils qw(matrix_to_intern intern_to_meme read_background_file);

use Fcntl qw(O_RDONLY);
use Getopt::Long;
use Pod::Usage;
use XML::Simple;
use Data::Dumper;

=head1 SYNOPSIS

nmica2meme [options]

 Options: 
  -skip <ID>                    skip this ID (may be repeated)
  -numseqs <numseqs>            assume frequencies based on this many
                                sequences; default: 20
  -truncate_names               truncate motif names at first underscore
  -numbers                      use numbers instead of strings as motif names
  -bg <background file>         file with background frequencies of letters;
                                default: uniform background
  -pseudo <total pseudocounts>  add <total pseudocounts> times letter
                                background to each frequency; default: 0
  -logodds                      print log-odds matrix, too;
                                default: print frequency matrix only
  -url <website>                website for the motif; The untruncated ID is
                                substituted for MOTIF_NAME; default: no url
  -h                            print usage message

  Read a nestedMICA (BioTiffin/XMS) matrix file and convert to MEME format.
  Note only DNA motifs are supported.

  Reads standard input.
  Writes standard output

=cut

# Set option defaults
my %skips;
my $num_seqs = 20;
my $truncate_names = 0;
my $use_numbers = 0;
my $bg_file;
my $pseudo_total = 0; # default total pseudocounts
my $print_logodds = 0;
my $url_pattern = '';
my $help;
# XMS base names
my @bases = ('adenine', 'cytosine', 'guanine', 'thymine');

GetOptions("skip=s" => sub {$skips{$_[1]} = 1}, "numseqs=i" => \$num_seqs,
  "truncate_names" => \$truncate_names, "numbers" => \$use_numbers, 
  "bg=s" => \$bg_file, "pseudo=f" => \$pseudo_total, "logodds" => \$print_logodds,
  "url=s" => \$url_pattern, "h" => \$help) or pod2usage(2);
#check num seqs
pod2usage("Option -numseqs must have a positive value.") if ($num_seqs < 0);
#check pseudo total
pod2usage("Option -pseudo must have a positive value.") if ($pseudo_total < 0);

# output help if requested
pod2usage(1) if $help;

# Get the background model.
my %bg = &read_background_file(&dna(), $bg_file); #DNA background file

# Read the input file.
my $xml = new XML::Simple(KeepRoot => 1, ForceArray => ['motif', 'column'], 
    KeyAttr => {'motif' => 'name', 'weight' => 'symbol'}, 
    ContentKey => '-content');
my $data;
eval {
  $data = $xml->XMLin('-');		# read from standard input
};
if ($@) {
  pod2usage("Bad file on standard input.");
}
#print STDERR Dumper($data);

$data = $data->{'motifset'} if (defined($data->{motifset}));
my $motifs = $data->{motif};


my $n = @bases;
my $num_motifs = 0;
my $num_skipped = 0;
foreach my $motif_name (sort keys %{$motifs}) {
  my $weight_matrix = $motifs->{$motif_name}->{weightmatrix};
  if ($weight_matrix->{'alphabet'} ne 'DNA') {
    print(STDERR "Skipped $motif_name as only DNA motifs are supported.\n");
    $num_skipped++;
    next;
  }
  my $w = $weight_matrix->{columns};
  my $cols = $weight_matrix->{column};
  my $matrix = '';
  # loop over columns of motif
  foreach my $col (@{$cols}) {
    # get the frequencies for this column
    my $freqs = $col->{weight};
    # create an ascii matrix
    for (my $i = 0; $i < scalar(@bases); $i++) {
      $matrix .= " " . $freqs->{$bases[$i]}; 
    }
    $matrix .= "\n";
  }
  # convert and print ascii matrix
  my ($motif, $errors) = matrix_to_intern(\%bg, $matrix, 'row', $num_seqs, 
      $pseudo_total, id => $motif_name);
  print intern_to_meme($motif, $print_logodds, 1, !($num_motifs++)) if $motif;
  print(STDERR join("\n", @{$errors}), "\n") if @{$errors};
}


print(STDERR "Converted $num_motifs motifs.\n");
print(STDERR "Skipped $num_skipped motifs.\n");
