#!@WHICHPERL@ -w
#
# FILE: sites2meme
# AUTHOR: James Johnson
# CREATE DATE: 25/3/2014
# DESCRIPTION: Convert files containing motif sites into MEME motifs
# 

use warnings;
use strict;

use lib qw(@PERLLIBDIR@);

use MotifUtils qw(seq_to_intern intern_to_meme load_alphabet read_background_file);

use Fcntl qw(O_RDONLY);
use File::Spec::Functions qw(catfile);
use Getopt::Long;
use Pod::Usage;

=head1 NAME

sites2meme - Converts files containing motif sites into MEME motifs

=head1 SYNOPSIS

sites2meme [options] <directory>

 Options:
  -ext <input files extension>  the file extension (with '.') of the sites files;
                                 the file name minus the extension will be
                                 used as the motif identifer;
                                 default: expect an extension of ".txt"

  -map <id mapping file>        tab separated file containing id, name pairs.

  -protein                      Sets the expected alphabet to protein;
                                 default: the expected alphabet is DNA

  -alph <alphabet file>         Set the expected alphabet to the defintion
                                 in the file; default: DNA

  -bg <background file>         file with background frequencies of letters; 
                                 default: use uniform background

  -pseudo <total pseudocounts>  add <total pseudocounts> times letter
                                background to each frequency; default: 0

  -logodds                      print log-odds matrix, too;
                                default: print frequency matrix only

  -url <website>                website for the motif; The motif id is
                                 substituted for MOTIF_NAME

 Each file in the directory is assumed to match the pattern ID.txt where ID is
 the motif identifier. Each file should contain a newline separated list of sites.

 Writes to standard output.

=cut

# globals
my ($bg, $pseudo_total, $url_pattern, $print_logodds, $count, $num_bad);

sub main {
  # set global defaults
  $bg = undef; # loaded below
  $pseudo_total = 0;
  $url_pattern = "";
  $print_logodds = 0; # FALSE
  $count = 0;
  $num_bad = 0;

  # Set option defaults (excluding globals)
  my $is_dna = 1; # default
  my $is_rna = 0; 
  my $is_protein = 0; 
  my $sites_ext = ".txt";
  my $map_file = undef;
  my $bg_file = undef;
  my $alph_file = undef;

  # process the arguments list
  GetOptions(
    "dna" => \$is_dna, "rna" => \$is_rna, "protein" => \$is_protein,
    "ext=s" => \$sites_ext,
    "map=s" => \$map_file, 
    "bg=s" => \$bg_file,
    "pseudo=f" => \$pseudo_total,
    "url=s" => \$url_pattern,
    "logodds" => \$print_logodds,
    "alph=s" => \$alph_file
    #"protein" => sub {$is_dna = 0;}
  ) or pod2usage(2);
  pod2usage("A directory containing site motifs must be specified for the conversion.") unless @ARGV;
  my ($motif_dir) = @ARGV;

  # DNA is default alphabet
  my $alpha = ($is_protein ? "PROTEIN" : ($is_rna ? "RNA" : "DNA"));

  # read the background file
  my %bg_val = &read_background_file(&load_alphabet($alpha, $alph_file), $bg_file);
  $bg = \%bg_val;

  # read the identifier mappings
  my $map = &read_map($map_file);

  # read the specified directory
  my ($motif_dh, $motif_file);
  opendir($motif_dh, $motif_dir) || die("Cannot open directory: $!");
  my @motif_files = sort { $a cmp $b } readdir($motif_dh);
  closedir($motif_dh);
  while ($motif_file = shift @motif_files) {
    # check if the file has the required extension
    if (substr($motif_file, -length($sites_ext)) eq $sites_ext) {
      # Read the motif information
      my $id = substr($motif_file, 0, length($motif_file) - length($sites_ext));
      my $alt = $map->{$id};
      my $sites = &read_sites(catfile($motif_dir, $motif_file));
      # write out the MEME motif
      &write_motif($id, $alt, $sites);
    }
  }

  print(STDERR "Converted $count motifs.\n");
  print(STDERR "$num_bad conversion errors.\n");
}

sub read_sites {
  my ($motif_file) = @_;
  local $/ = undef;
  my $motif_fh;
  sysopen($motif_fh, $motif_file, O_RDONLY) || die("Can't open $motif_file.\n");
  my $sites = <$motif_fh>;
  close($motif_fh);
  return $sites;
}

sub read_map {
  my ($map_file) = @_;
  my %map = ();
  if (defined($map_file)) {
    my ($map_fh, $line);
    sysopen($map_fh, $map_file, O_RDONLY) || die("Can't open $map_file.\n");
    while ($line = <$map_fh>) {
      if ($line =~ m/^\s*(\S+)\s+(\S+)\s*$/) {
        $map{$1} = $2;
      }
    }
    close($map_fh);
  }
  return \%map;
}

sub write_motif {
  my ($id, $alt, $sites) = @_;
  my $url = $url_pattern; 
  $url =~ s/MOTIF_NAME/$id/g;
  my ($motif, $errors) = seq_to_intern($bg, $sites, 1, $pseudo_total,
    id => $id, alt => $alt, url => $url, also => '-');
  # display errors
  if (@{$errors}) {
    print STDERR "Motif ", $id, " Errors:\n", join("\n", @{$errors}), "\n";
  }
  # output motif
  if ($motif && scalar(@{$errors}) == 0) {
    print STDOUT intern_to_meme($motif, $print_logodds, 1, !($count++));
  } else {
    $num_bad++;
  }
}

main();
1;

