#!@WHICHPERL@
=head1 NAME

meme_xml_to_html - Make a MEME HTML output from a MEME XML output.

=head1 SYNOPSIS

meme_xml_to_html <MEME XML file> <MEME HTML file>
=cut

use strict;
use warnings;

use Cwd qw(abs_path);
use Fcntl qw(O_RDONLY SEEK_SET);
use File::Basename qw(fileparse);
use File::Spec::Functions qw(tmpdir);
use File::Temp qw(tempfile);
use Getopt::Long;
use Pod::Usage;

use lib '@PERLLIBDIR@';

my $etc_dir;
my $temp_dir;
my $scripts_dir;

#
# initialise the global constants
# 
sub initialise {
  # setup etc dir
  $etc_dir = defined($ENV{MEME_DATA_DIR}) ? $ENV{MEME_DATA_DIR} : '@APPCONFIGDIR@';
  # setup temporary directory
  $temp_dir = '@TMP_DIR@';
  # use the perl default if none is supplied or the replace fails
  $temp_dir = tmpdir() if ($temp_dir eq '' || $temp_dir =~ m/^\@TMP[_]DIR\@$/);

  # find the location of the script
  my $script_name;
  ($script_name, $scripts_dir) = fileparse(__FILE__);
  $scripts_dir = abs_path($scripts_dir);

  # add script location to search path
  unshift(@INC, $scripts_dir);

  require HtmlMonolithWr;
  require MemeSAX;
}

sub arguments {
  # Set Option Defaults
  my $options = {XML_PATH => undef, HTML_PATH => undef};
  # General Options
  my $help = 0; # FALSE
  my @errors = ();
  my @dbs = ();

  # get the options from the arguments
  my $options_success = 0; # FALSE
  # redirect stderr to a temp file so we can get the error message from GetOptions
  my $olderr;
  my $tmperr = tempfile('GetOptions_XXXXXXXXXX', DIR => $temp_dir, UNLINK => 1);
  open($olderr, ">&STDERR") or die("Can't dup STDERR: $!");
  open(STDERR, '>&', $tmperr) or die("Can't redirect STDERR to temp file: $!");
  # parse options
  $options_success = GetOptions(
    'help|?'          => \$help,
  );
  ($options->{XML_PATH}, $options->{HTML_PATH}) = @ARGV;
  # display help
  pod2usage(1) if $help;
  # reset STDERR
  open(STDERR, ">&", $olderr) or die("Can't reset STDERR: $!");
  # read argument parsing errors
  seek($tmperr, 0, SEEK_SET);
  while (<$tmperr>) {chomp; push(@errors, $_);}
  close($tmperr);
  # check source XML file
  unless (defined($options->{XML_PATH})) {
    push(@errors, "No MEME XML file specified");
  } elsif (not -e $options->{XML_PATH}) {
    push(@errors, "The MEME XML file specified does not exist");
  }
  unless (defined($options->{HTML_PATH})) {
    push(@errors, "No output file specified");
  }
  # print errors
  foreach my $error (@errors) {
    print STDERR $error, "\n";
  }
  pod2usage(2) if @errors;
  # return options
  return $options;
}

# start_meme
sub start_meme {
  my ($info, $major_ver, $minor_ver, $patch_ver, $release) = @_;
  my $wr = $info->{wr};
  $wr->str_prop('program', 'MEME');
  $wr->str_prop('version', "$major_ver.$minor_ver.$patch_ver");
  # meme doesn't include the revision information in the XML
  $wr->str_prop('release', $release);
}
# end_meme
sub end_meme {
  my ($info) = @_;
  my $wr = $info->{wr};
}
# start_training_set
sub start_training_set {
  my ($info, $primary_sequences, $primary_count, $primary_positions, 
   $control_sequences, $control_count, $control_positions) = @_;
  $info->{sequence_db} = {};
  $info->{sequence_db}->{primary_source} = $primary_sequences;
  $info->{sequence_db}->{primary_count} = $primary_count;
  $info->{sequence_db}->{primary_positions} = $primary_positions;
  $info->{sequence_db}->{control_source} = $control_sequences;
  $info->{sequence_db}->{control_count} = $control_count;
  $info->{sequence_db}->{control_positions} = $control_positions;
  $info->{sequence_db}->{temp} = tempfile("seq_db_info_XXXXXX", 
    SUFFIX => '.tsv', DIR => $temp_dir, UNLINK => 1);
  $info->{counter} = 0;
}
# end_training_set
sub end_training_set {
  my ($info) = @_;
}
# handle_alphabet
sub handle_alphabet {
  my ($info, $alphabet) = @_;
  $info->{alph} = $alphabet;
}
# handle_sequence
sub handle_sequence {
  my ($info, $num, $name, $length, $weight) = @_;
  # num is the number extracted out of the id
  # sequence_x where x is from 0 to the number of sequences -1
  # and where x increments with each sequence
  die("Sequence id number is not expected") unless ($info->{counter} == $num);
  # store the sequence information to a temporary file as we don't want to write
  # it out yet.
  my $fh = $info->{sequence_db}->{temp};
  print $fh "$name\t$length\t$weight\n";
  $info->{counter}++;
}
# handle_letter_frequencies
sub handle_letter_frequencies {
  my ($info, $freqs) = @_;
  $info->{sequence_db}->{freqs} = $freqs;
}
# handle_background_frequencies
sub handle_background_frequencies {
  my ($info, $source, $order, $freqs) = @_;
  $info->{bg} = {source => $source, order => $order, freqs => $freqs};
}
# end_model
sub handle_settings {
  my ($info, %settings) = @_;
  my $wr = $info->{wr};
  # determine stopping reason
  my $reason = $settings{reason_for_stopping};
  my $stop_type = "unknown";
  my $num_re = qr/((?:[-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?)|inf)/;
  my $max_time;
  if ($reason =~ m/Stopped because couldn't find any more starting points for EM\./) {
    $stop_type = "em";
  } elsif ($reason =~ m/Stopped because nmotifs = (\d+) reached\./) {
    $stop_type = "motifs";
  } elsif ($reason =~ m/Stopped because would probably run out of time \($num_re secs\)\./) {
    $stop_type = "time";
    $max_time = $1;
  } elsif ($reason =~ m/Stopped because next motif has fewer than (\d+) sites\./) {
    $stop_type = "sites";
  } elsif ($reason =~ m/Stopped because motif E-value > $num_re\./) {
    $stop_type = "evalue";
  }
  $wr->str_prop("stop_reason", $reason);
  # output command line
  $wr->str_array_prop("cmd", split(/\s+/, $settings{command_line}));
  # output options
  $wr->property("options");
  $wr->start_object_value();
  # BEGIN OPTIONS (note that I am deliberately using the names of the command line options)
  # site distribution
  my %dtype = ('zoops' => 'zoops', 'oops' => 'oops', 'anr' => 'anr', 'tcm' => 'anr');
  $wr->str_prop("mod", $dtype{$settings{type}});
  # reverse complements
  $wr->bool_prop("revcomp", $settings{strands} eq "both");
  # stop conditions
  $wr->num_prop("nmotifs", $settings{nmotifs});
  $wr->num_prop("evt", $settings{evalue_threshold}) unless ($settings{evalue_threshold} =~ m/^inf(inity)?$/i);
  $wr->num_prop("time", $max_time) if (defined($max_time)); # we only get this value when the stop reason is time
  # object function
  $wr->str_prop("objfun", $settings{object_function});
  # starting points function
  $wr->str_prop("spfun", $settings{spfun});
  # motif width
  $wr->num_prop("minw", $settings{min_width});
  $wr->num_prop("maxw", $settings{max_width});
  #sites
  $wr->num_prop("minsites", $settings{minsites});
  $wr->num_prop("maxsites", $settings{maxsites});
  $wr->num_prop("wnsites", $settings{wnsites});
  # selecting starts for EM
  $wr->str_prop("spmap", $settings{spmap});
  $wr->num_prop("spfuzz", $settings{spfuzz});
  $wr->num_prop("searchsize", $settings{searchsize});
  $wr->num_prop("maxsize", $settings{maxsize});
  $wr->str_prop("norand", $settings{norand});
  $wr->num_prop("csites", $settings{csites}) if ($settings{csites}); 
  $wr->num_prop("hsfrac", $settings{hsfrac});
  # EM algorithm
  $wr->str_prop("prior", $settings{prior});
  $wr->str_prop("priors_source", $settings{priors_file}) if ($settings{priors_file} =~ m/\S/);
  $wr->num_prop("b", $settings{beta});
  $wr->num_prop("maxiter", $settings{maxiter});
  $wr->num_prop("distance", $settings{distance});
  # trimming alignment costs
  if ($settings{wg}) {
    $wr->num_prop("wg", $settings{wg});
    $wr->num_prop("ws", $settings{ws});
    $wr->bool_prop("noendgaps", !$settings{endgaps});
  }
  # EM starting point source
  $wr->bool_prop("substring", $settings{substring});
  # Random sampling
  $wr->num_prop("seed", $settings{seed});
  # Display
  $wr->num_prop("brief", $settings{brief});
  # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  # These experimental options made it into the XML but
  # are probably not interesting so I've commented them
  # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  # sampling probability for subsq. starts (removed option, normally 1 but 0 if mfile set)
  # objective function (hidden option, default ev)
  #my %objfun = ('P-value of product of p-values' => 'pv',
  #  'E-value of product of p-values' => 'ev');
  #$wr->str_prop("objfun", $objfun{$settings{object_function}});
  # fraction of sequences to use (hidden option, default 1)
  #$wr->num_prop("seqfrac", $settings{seqfrac});
  # END OPTIONS
  $wr->end_object_value();
  $wr->property("alphabet");
  $info->{alph}->to_json($wr);
  $wr->property("background");
  $wr->start_object_value();
  $wr->str_prop("source", $info->{bg}->{source});
  $wr->num_prop("order", $info->{bg}->{order});
  $wr->num_array_prop("freqs", @{$info->{bg}->{freqs}});
  $wr->end_object_value();

  $wr->property("sequence_db");
  $wr->start_object_value();
  $wr->str_prop("primary_source", $info->{sequence_db}->{primary_source});
  $wr->num_prop("primary_count", $info->{sequence_db}->{primary_count});
  $wr->num_prop("primary_positions", $info->{sequence_db}->{primary_positions});
  $wr->str_prop("control_source", $info->{sequence_db}->{control_source});
  $wr->num_prop("control_count", $info->{sequence_db}->{control_count});
  $wr->num_prop("control_positions", $info->{sequence_db}->{control_positions});
  $wr->str_prop("psp_source", $settings{psp_file}) if ($settings{psp_file} =~ m/\S/);
  $wr->num_array_prop("freqs", @{$info->{sequence_db}->{freqs}});
  if ($info->{counter} > 0) {
    $wr->property("sequences");
    $wr->start_array_value();
    my $fh = $info->{sequence_db}->{temp};
    seek($fh, 0, SEEK_SET);
    while (<$fh>) {
      chomp($_);
      my ($name, $len, $weight) = split(/\t/, $_);
      $wr->start_object_value();
      $wr->str_prop("name", $name);
      $wr->num_prop("length", $len);
      $wr->num_prop("weight", $weight);
      $wr->end_object_value();
    }
    $wr->end_array_value();
    $info->{sequence_db}->{temp} = undef;
    close($fh);
  }
  $wr->end_object_value();
}
# start_background_frequencies
sub start_background_frequencies {
  my ($info, $source, $order) = @_;
  $info->{background_source} = $source;
  $info->{background_order} = $order;
}
# end_background_frequencies
sub end_background_frequencies {
  my ($info) = @_;
}
# start_bf_alphabet_array
sub start_bf_alphabet_array {
  my ($info) = @_;
  my $wr = $info->{wr};
}
# end_bf_alphabet_array
sub end_bf_alphabet_array {
  my ($info) = @_;
  my $wr = $info->{wr};
}
# handle_bf_aa_value
sub handle_bf_aa_value {
  my ($info, $letter_id, $frequency) = @_;
  my $wr = $info->{wr};
  my $letter = $info->{id_to_symbol}->{$letter_id};
  die("Expected non-ambiguous letter") if $letter->{ambig};
  $info->{background}->{$letter->{symbol}} = $frequency;
}
# start_motifs
sub start_motifs {
  my ($info) = @_;
  my $wr = $info->{wr};
  $wr->property("motifs");
  $wr->start_array_value();
}
# end_motifs
sub end_motifs {
  my ($info) = @_;
  my $wr = $info->{wr};
  $wr->end_array_value();
}
# start_motif
sub start_motif {
  my ($info, %motif) = @_;
  my $wr = $info->{wr};
  $wr->start_object_value();
  $wr->num_prop("db", 0);
  $wr->str_prop("id", $motif{NAME});
  $wr->str_prop("alt", $motif{ALT});
  $wr->num_prop("len", $motif{WIDTH});
  $wr->num_prop("nsites", $motif{SITES});
  $wr->str_prop("evalue", $motif{EVALUE});
  $wr->num_prop("ic", $motif{IC}); #information content
  $wr->num_prop("re", $motif{RE}); #relative entropy
  $wr->num_prop("llr", $motif{LLR}); #log likelihood ratio
  $wr->num_prop("bt", $motif{BAYES}); # bayes threshold
  $wr->num_prop("time", $motif{ELAPSED});
  $wr->str_prop("url", $motif{URL}) if $motif{URL};
  my $psm = $motif{PSM};
  $wr->property("psm");
  $wr->start_array_value();
  for (my $i = 0; $i < scalar(@{$psm}); $i++) {
    $wr->num_array_value(@{$psm->[$i]});
  }
  $wr->end_array_value();
  my $pwm = $motif{PWM};
  $wr->property("pwm");
  $wr->start_array_value();
  for (my $i = 0; $i < scalar(@{$pwm}); $i++) {
    $wr->num_array_value(@{$pwm->[$i]});
  }
  $wr->end_array_value();
  $wr->property("sites");
  $wr->start_array_value();
}
sub handle_site {
  my ($info, $seq_num, $pos, $rc, $pvalue, $lflank, $match, $rflank) = @_;
  my $wr = $info->{wr};
  $wr->start_object_value();
  $wr->num_prop("seq", $seq_num);
  $wr->num_prop("pos", $pos);
  $wr->bool_prop("rc", $rc);
  $wr->num_prop("pvalue", $pvalue);
  $wr->str_prop("lflank", $lflank); 
  $wr->str_prop("match", $match);
  $wr->str_prop("rflank", $rflank);
  $wr->end_object_value();
}
# end_motif
sub end_motif {
  my ($info) = @_;
  my $wr = $info->{wr};
  $wr->end_array_value();
  $wr->end_object_value();
}
# start_scanned_sites_summary
sub start_scanned_sites_summary {
  my ($info, $scan_threhold) = @_;
  my $wr = $info->{wr};
  $wr->property("scan");
  $wr->start_array_value();
  $info->{counter} = 0;
}
# end_scanned_sites_summary
sub end_scanned_sites_summary {
  my ($info) = @_;
  my $wr = $info->{wr};
  $wr->end_array_value();
}
# start_scanned_sites
sub start_scanned_sites {
  my ($info, $num, $pvalue, $num_sites) = @_;
  my $wr = $info->{wr};
  die("Sequence id number is not expected") unless ($info->{counter} == $num);
  $wr->start_object_value();
  $wr->num_prop("pvalue", $pvalue);
  $wr->property("sites");
  $wr->start_array_value();
}
# end_scanned_sites
sub end_scanned_sites {
  my ($info) = @_;
  my $wr = $info->{wr};
  $wr->end_array_value();
  $wr->end_object_value();
  $info->{counter}++;
}
# handle_scanned_site
sub handle_scanned_site {
  my ($info, $motif_num, $strand, $pos, $pvalue) = @_;
  my $wr = $info->{wr};
  $wr->start_object_value();
  $wr->num_prop("motif", $motif_num - 1); # convert to index
  $wr->num_prop("pos", $pos); # TODO check is zero based.
  $wr->bool_prop("rc", $strand eq 'minus');
  $wr->num_prop("pvalue", $pvalue);
  $wr->end_object_value();
}

sub transform_data {
  my ($opts, $jsonwr) = @_;
  my $info = {wr => $jsonwr};

  my $sax = new MemeSAX($info, 
    start_meme => \&start_meme,
    start_training_set => \&start_training_set,
    handle_alphabet => \&handle_alphabet,
    handle_sequence => \&handle_sequence,
    end_training_set => \&end_training_set,
    handle_letter_frequencies => \&handle_letter_frequencies,
    handle_background_frequencies => \&handle_background_frequencies,
    handle_settings => \&handle_settings,
    start_motifs => \&start_motifs,
    start_motif => \&start_motif,
    handle_site => \&handle_site,
    end_motif => \&end_motif,
    end_motifs => \&end_motifs,
    start_scanned_sites_summary => \&start_scanned_sites_summary,
    start_scanned_sites => \&start_scanned_sites,
    handle_scanned_site => \&handle_scanned_site,
    end_scanned_sites => \&end_scanned_sites,
    end_scanned_sites_summary => \&end_scanned_sites_summary,
    end_meme => \&end_meme,
  );
  my $fh;
  sysopen($fh, $opts->{XML_PATH}, O_RDONLY) or die("Failed to open file \"$opts->{XML_PATH}\"\n");
  while (<$fh>) {
    $sax->parse_more($_);
    if ($sax->has_errors()) {
      die("Failed to write HTML output due to errors processing the XML:\n" . join("\n", $sax->get_errors()));
    }    
  }
  $sax->parse_done();
  if ($sax->has_errors()) {
    die("Failed to write HTML output due to errors processing the XML:\n" . join("\n", $sax->get_errors()));
  }  
  my @errors = $sax->get_errors();
  foreach my $error (@errors) {
    print $error, "\n";
  }
}

sub main {
  &initialise();
  my $opts = &arguments();
  # start writing HTML
  my $htmlwr = new HtmlMonolithWr($etc_dir, 'meme_template.html', 
    $opts->{HTML_PATH}, 'meme_data.js' => 'data');
  # transform the XML into JSON
  &transform_data($opts, $htmlwr->output());
  # finish writing HTML
  $htmlwr->output();
}

&main();
1;
