#!@WHICHPERL@ -w
#
# FILE: taipale2meme
# AUTHOR: James Johnson
# CREATE DATE: 19/10/2010
# DESCRIPTION: Process tab separated files exported from spreadsheets (xls files).
# 

use warnings;
use strict;

use lib qw(@PERLLIBDIR@);

use Alphabet qw(dna);
use MotifUtils qw(matrix_to_intern intern_to_meme read_background_file);

use Getopt::Long;
use Pod::Usage;


=head1 NAME

taipale2meme - Process tab separated value files that have been exported from spreadsheets to meme motifs. 

=head1 SYNOPSIS

taipale2meme [options]

 Options:
  [-nc <c>]*              file columns from which to create motif names 
                          by joining with "_"; default: first non-empty column
  [-oc <c>]*              omit PWM if this column not empty
  -postfix <append>       text to append to motif names.
  -strands 1|2            print '+ -' or '+' on the MEME strand line;
                          default: 2 (prints '+ -')
  -bg <background file>   file with background frequencies of letters; 
                          default: uniform
  -pseudo <A>             add <A> times background frequency to
                          each count when computing letter frequencies
                          default: 0
  -logodds                print log-odds matrix as well as frequency matrix;
                          default: frequency matrix only
  -url <website>          website for the motif if it doesn't have one 
                          already; The motif name is substituted for 
                          MOTIF_NAME; default: use existing url
  -h                      print usage message

 Reads standard input.
 Writes standard output.

=cut

# Constants
my $sites = 20;

# set option defaults
my @name_cols;
my @omit_cols;
my $postfix = "";
my $strands = 2;
my $bg_file;
my $pseudo_total = 0;
my $print_logodds = 0;
my $url_pattern = "";
my $help = 0;


GetOptions(
  "nc=i" => \@name_cols,
  "oc=i" => \@omit_cols,
  "postfix=s" => \$postfix, 
  "strands=i" => \$strands, 
  "bg=s" => \$bg_file, 
  "pseudo=f" => \$pseudo_total, 
  "logodds" => \$print_logodds, 
  "url=s" => \$url_pattern, 
  "help|?" => \$help) or pod2usage(2);

#check strands
pod2usage("Option -strands must be either 1 or 2.") unless ($strands == 1 || $strands == 2);
#check pseudo total
pod2usage("Option -pseudo must have a positive value.") if ($pseudo_total < 0);

pod2usage(1) if $help;

#printf STDERR "%s\n", join(" ", @name_cols);

my @l5l = ("", "", "", "", "");
my $matchA = qr/^(["']?)[aA]\1\t/;
my $matchC = qr/^(["']?)[cC]\1\t/;
my $matchG = qr/^(["']?)[gG]\1\t/;
my $matchT = qr/^(["']?)[tT]\1\t/;


# get the background model
my %bg = &read_background_file(&dna(), $bg_file);

my %dup_check = ();
my %base_name = ();
my %matrices = ();

while (<>) {
  chomp;
  # skip blank lines
  next if (/^\s*$/);
  #update the last 5 lines
  push(@l5l, $_);
  shift(@l5l);

  #look for A, C G and T possibly wrapped with " or ' at the start of the last 4 lines
  if ($l5l[1] =~ $matchA && $l5l[2] =~ $matchC && $l5l[3] =~ $matchG && $l5l[4] =~ $matchT) {
    # try to extract the name
    my $name = "";
    if (scalar(@name_cols)==0) {
      # name is first non-empty column
      foreach (split(/\t/,$l5l[0])) {
        if ($_ =~ m/^(["']?)(.+)\1$/) {
          $name = $2;
          last;
        }
      } 
    } else {
      my $col = 0;
      $name = "";
      my @fields = split(/\t/,$l5l[0]);
      foreach my $col (@name_cols) {
        $_ = $fields[$col-1];
        if ($_ =~ m/^\s*(["']?)(.+)\1\s*$/) {
          $name .= $name ? "_$2" : $2;
          $name =~ s/ /_/g;		# replace spaces with underscores
        }
      }
    }
    die("Missing motif name!\n") if ($name eq "");
    $name .= $postfix;
 
    # Add _n to names (starting with _1) and save the base name for convenience
    my $base = $name;
    $dup_check{$base} = $dup_check{$base} ? $dup_check{$base}+1 : 1;
    $name .= "_" . $dup_check{$base};
    $base_name{$name} = $base;

    # check that "omit" fields are empty
    my $keep = 1;
    foreach my $col (@omit_cols) {
      my @fields = split(/\t/,$l5l[0]);
      if ($fields[$col-1] ne "") {
        $keep = 0;
        last;
      }
    }
    
    # save the PWM matrix indexed by name
    if ($keep) { 
      # extract the PSPM and convert it
      my $matrix = "";
      for (my $i = 1; $i < 5; $i++) {
        my $line = $l5l[$i];
        $line =~ s/^(["']?)[aAcCgGtT]\1\t//;
        $matrix .= $line . "\n";
      }
      $matrices{$name} = $matrix;
    }

    #ensure no accidental reuse of data by clearing the cache
    @l5l = ("", "", "", "", "");
  }
}

# Output the motifs only adding _n if more than 1 motif with same base_name
my $num_motifs = 0;
my $name;
foreach $name (sort(keys %matrices)) {
  my $matrix = $matrices{$name};
  # Strip off the _1 from the name if there is only one motif with the base_name.
  my $base = $base_name{$name};
  $name = $base_name{$name} if ($dup_check{$base} == 1);
  my $url = $url_pattern;
  $url =~ s/MOTIF_NAME/$name/g;
  my ($motif, $errors) = matrix_to_intern(\%bg, $matrix, 'col', $sites,
  $pseudo_total, rescale => 1, id => $name, url => $url);
  print STDERR join("\n", @{$errors}), "\n" if @{$errors};
  print intern_to_meme($motif, $print_logodds, 1, !($num_motifs++)) if $motif;
}
