#!@WHICHPERL@ -w

#**********************************************************************/
#* Copyright (c) University of Washington,                            */
#* Department of Genome Sciences, 2006. Written by Shobhit Gupta      */
#* and Timothy L. Bailey     					      */
#* All rights reserved.                                               */
#**********************************************************************/ 

use warnings;
use strict;

use lib qw(@PERLLIBDIR@);

use Alphabet qw(dna rna protein);
use MotifUtils qw(seq_to_intern matrix_to_intern intern_to_meme read_background_file);

use Fcntl qw(O_RDONLY);
use Getopt::Long;
use Pod::Usage;


=head1 NAME

=head1 SYNOPSIS

jaspar2meme [options] <Jaspar directory | PFM bundle>

 Options: 
  -bundle         read a single file containing many JASPAR
                  count matrices in 2014 or 2016 format with 
		  their names.
  -pfm            read JASPAR count files (.pfm); 
                  default: site files (.sites)
  -cm             read count file with line labels 'A|' etc. (.cm);
                  default: site files (.sites)
  -numbers        use numbers instead of strings as motif names
  -strands 1|2    print '+ -' '+' on the MEME strand line;
                  default: 2 (prints '+ -')
  -bg <bfile>     file with background frequencies in MEME
                  -bfile format; default: uniform frequencies
  -pseudo <A>     add <A> times background frequency to
                  each count when computing letter frequencies
                  default: 0
  -logodds        print log-odds matrix as well as frequency matrix;
                  default: frequency matrix only
  -url <website>  website for the motif; The motif name
                  is substituted for MOTIF_NAME;


  Convert a directory of JASPAR files into a MEME version 4 formatted 
  file suitable for use with MAST and other MEME Suite programs.

  A JASPAR '.sites' file describes a motif in terms of a multiple
  alignment of sites.  It contains a multiple alignment in modified 
  FASTA format.  Only capitalized sequence letters are part of the alignment.

  A JASPAR count file ('.pfm') contains a count matrix where the rows
  correspond to A, C, G and T, respectively.  

  A CM count file ('.cm') prefixes the rows with 'A| ', 'C| ', 'G| ' and 'T| '.

  A log-odds matrix and a probability matrix are output for each
  motif ('.sites') file.  The probability matrix is computed using
  pseudo-counts consisting of the background frequency (see -bg, above)
  multiplied by the total pseudocounts (see -pseudo, above).
  The log-odds matrix uses the background frequencies in the denominator
  and is log base 2.

  If a matrix_list.txt file exists and -pfm is given, the JASPAR names of the
  motifs are read from that file and included in the output.
  
  Writes standard output.

=cut

sub output_matrix {

}

sub main {
  # Set option defaults
  my $bundle_mode = 0; #FALSE
  my $ext = "sites";
  my $strands = 2;
  my $use_numbers = 0;
  my $bg_file;
  my $pseudo_total = 0;
  my $print_logodds = 0;
  my $url_pattern = '';

  GetOptions("bundle" => \$bundle_mode, 
    "pfm" => sub {$ext = "pfm"}, "cm" => sub {$ext = "cm"}, 
    "numbers" => \$use_numbers, "strands=i" => \$strands, "bg=s" => \$bg_file, 
    "pseudo=f" => \$pseudo_total, "logodds" => \$print_logodds, 
    "url=s" => \$url_pattern) or pod2usage(2);
  #check strands
  pod2usage("Option -strands must be either 1 or 2.") unless ($strands == 1 || $strands == 2);
  #check pseudo total
  pod2usage("Option -pseudo must have a positive value.") if ($pseudo_total < 0);

  # get the background model
  my %bg = &read_background_file(&dna(), $bg_file);
  #my %bg = &read_background_file(&rna(), $bg_file);
  my $alph = $bg{alph};

  my $count = 0; # number of motifs written
  unless ($bundle_mode) {
    #check directory
    pod2usage("A directory must be specified for the conversion.") unless @ARGV;
    my $target_directory = shift(@ARGV);
    pod2usage("Unless converting bundled pfm files then you must specify a directory") unless (-d $target_directory);
    pod2usage("Only one directory may be specified for the conversion.") if @ARGV;

    # change into the jaspar directory
    chdir($target_directory) or die("Failed to change into the Jaspar directory at \"$target_directory\".\n");

    # get the matrix_list.txt file if there is one and
    # get the motif names from it.
    my %motif_names;
    my $fp;
    if ($ext eq "pfm") {
      my $names_file = "matrix_list.txt";
      if (-e $names_file) {
        printf(STDERR "Found $names_file file.  Reading in motif names.\n");
        sysopen($fp, $names_file, O_RDONLY) || die "Can't open file \"$names_file\" in directory \"$target_directory\".\n";
        while (my $line = <$fp>) {
          my @words = split(/\s+/, $line);
          $motif_names{$words[0]} = $words[2];
        }
        close($fp);
      }
    }

    # read in motif files and print motifs
    my @files = glob("*.$ext");
    foreach my $file (@files) {
      # get the id from the file name
      $file =~ m/(\S*)\.$ext/;
      my $jaspar_id = $1;
      my $name = ($use_numbers ? $count + 1 : $jaspar_id);
      my $alt = $motif_names{$jaspar_id};
      $alt .= "($jaspar_id)" if $use_numbers;
      my $url = $url_pattern;
      $url =~ s/MOTIF_NAME/$jaspar_id/g;
      # open the file
      sysopen($fp, $file, O_RDONLY) || die "Can't open file \"$file\" in directory \"$target_directory\"\n";
      # read the file
      my ($motif, $errors);
      if ($ext eq "pfm" || $ext eq "cm") {  # read a counts file (.pfm, .cm)
        my $matrix = '';
        while (my $line = <$fp>) {
          $line =~ s/#.*$//; # delete comments
          $line =~ s/^\s*[A-Za-z]\s*\|// if ($ext eq "cm");# discard line labels
          $matrix .= $line;
        }
        ($motif, $errors) = matrix_to_intern(\%bg, $matrix, 'col', 1, $pseudo_total, 
          strands => $strands, id => $name, alt => $alt, url => $url); #note: given site_count of 1
      } elsif ($ext eq "sites") { # read a sites file
        my $seq = '';
        my $is_site = 0;
        while (my $line = <$fp>) {
          chomp($line);
          if ($is_site) {
            $line =~ s/[a-z]//g; # delete lower case letters
            $seq .= $line . "\n";
            $is_site = 0;
          } elsif ($line =~ m/^\>/) {
            # the line looks like:
            # >MA0001	AGL3	1
            if ($line =~ m/^\>\S+\s+(\S+)\s/ && !defined($alt)) {
              $alt = $1;
              $alt .= "($jaspar_id)" if $use_numbers;
            }
            $is_site = 1;
          }
        }
        ($motif, $errors) = seq_to_intern(\%bg, $seq, 1, $pseudo_total, 
          strands => $strands, id => $name, alt => $alt, url => $url);
      } # read file
      # close the file
      close($fp);
      #print errors
      print(STDERR join("\n", @{$errors}), "\n") if @{$errors};
      if ($motif) {
        # print motif
        print STDOUT intern_to_meme($motif, $print_logodds, 1, !($count++));
      } else {
        print STDERR "Conversion of \"$file\" failed.\n" unless $motif;
      }
    } # foreach file
  } else {
    # check bundle file
    pod2usage("A file (or '-' to read from STDIN) must be specified for the conversion.") unless @ARGV;
    my $bundle = shift(@ARGV);
    pod2usage("When in bundle mode you must specify a file, not a directory.") unless (-f $bundle or $bundle eq '-');
    pod2usage("Only one file may be specified for the conversion.") if @ARGV;
    open my $fp, "<$bundle" or die "Can't open file \"$bundle\".\n";
    my ($matrix, $name, $alt, $url);
    # anon sub to print a matrix
    my $print_matrix = sub {
      if ($matrix && $name) {
        my ($motif, $errors) = matrix_to_intern(
          \%bg, $matrix, 'col', 1, $pseudo_total, 
          strands => $strands, id => $name, alt => $alt, url => $url);
        #print errors
        print(STDERR join("\n", @{$errors}), "\n") if @{$errors};
        if ($motif) {
          # print motif
          print STDOUT intern_to_meme($motif, $print_logodds, 1, !($count++));
        } else {
          print STDERR "Conversion of motif $name $alt failed.\n" unless $motif;
        }
      }
    };
    # read bundled pfms
    my @lines = ();	# array of lines to allow any input alphabet order
    while (my $line = <$fp>) {
      if ($line =~ m/^\s+$/) {				# blank line
        next;				
      } elsif ($line =~ m/^>(\S+)\s+(\S+)/) {		# ID line
        my $jaspar_id = $1;
        my $jaspar_alt = $2;
        if (@lines) { $matrix = join("", @lines); }
        $print_matrix->();
        $name = ($use_numbers ? $count + 1 : $jaspar_id);
        $alt = $jaspar_alt;
        $alt .= "($jaspar_id)" if $use_numbers;
        $url = $url_pattern;
        $url =~ s/MOTIF_NAME/$jaspar_id/g;
        $matrix = "";
      } elsif ($line =~ m/^\s*(\d+\s+)*(\d+)\s*$/) {	# pre-2016 format
        $matrix .= $line;			# save line
      } elsif (($line =~ m/^\s*(\w)\s*\[[\d\s]+\]\s*$/) && $alph->is_core($1)) { # 2016 format
	my $index = $alph->index($1);		# save letter index
	$line =~ m/\[([\d\s]+)\]/;		# extract counts between brackets
	$lines[$index] = $1 . "\n";		# put line letter's location
      } else {
        print STDERR "Warning: unexpected line skipped \"$line\"";
        next;
      }
    }
    if (@lines) { $matrix = join("", @lines); }
    $print_matrix->();
  }
}

main();
1;
