package MemeSAX;

use base 'CheckingSAX';

use strict;
use warnings;

use CheckingSAX;
use Alphabet;

my $any_re = qr/^(.*)$/;
my $sci_re = qr/[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?/;
my $sci_or_inf_re = qr/(?:$sci_re|inf)/;
my $num_re = qr/^([+-]?$sci_or_inf_re)$/;
my $num_trim_re = qr/^\s*([+-]?$sci_or_inf_re)\s*$/;
my $pos_num_re = qr/^([+]?$sci_or_inf_re)$/;
my $pos_num_trim_re = qr/^\s*([+]?$sci_or_inf_re)\s*$/;
my $int_re = qr/^(\d+)$/;
my $int_trim_re = qr/^\s*(\d+)\s*$/;
my $float_re = qr/^(\d+(?:\.\d+)?)$/;
my $float_trim_re = qr/^\s*(\d+(?:\.\d+)?)\s*$/;
my $trim_re = qr/^\s*(.*?)\s*$/;
my $text_re = qr/^(.*\S.*)$/;
my $nospace_re = qr/^(\S+)$/;
my $sym_re = qr/[A-Za-z0-9\?\.\*\-]/;
my $sym_id_re = qr/^([a-zA-Z]|n[0-9]|x[a-zA-Z0-9]{2})$/;

my $ST_IN_SSITES = [{ELE => 'scanned_site', RPT => $RPT_ANY, ATRS => [
      {ATR => 'motif_id', VAL => qr/^motif_(\d+)$/},
      {ATR => 'strand', VAL => qr/^(plus|minus|none)$/},
      {ATR => 'position', VAL => $int_re},
      {ATR => 'pvalue', VAL => \&_value_pvalue}]}];
my $ST_IN_SSS = [{ELE => 'scanned_sites', RPT => $RPT_ALO, TO => $ST_IN_SSITES, ATRS => [
      {ATR => 'sequence_id', VAL => qr/^sequence_(\d+)$/},
      {ATR => 'pvalue', VAL => \&_value_pvalue},
      {ATR => 'num_sites', VAL => $int_re}]}];
my $ST_IN_MODEL = [
  &_struc_setting('command_line', $trim_re),
  &_struc_setting('host', $trim_re),
  &_struc_setting('type', \&_value_modtype),
  &_struc_setting('nmotifs', $int_trim_re),
  &_struc_setting('evalue_threshold', \&_value_evalue),
  &_struc_setting('object_function', $trim_re), #TODO stricter RE
  &_struc_setting('spfun', $trim_re),
  &_struc_setting('min_width', $int_trim_re),
  &_struc_setting('max_width', $int_trim_re),
  # These optional values were in Tim's experimental "new" MEME branch...
  &_struc_setting('mask_type', $any_re, 1),
  &_struc_setting('min_key', $any_re, 1),
  &_struc_setting('max_key', $any_re, 1),
  &_struc_setting('negfile', $any_re, 1),
  &_struc_setting('kmer', $any_re, 1),
  &_struc_setting('ncopies', $any_re, 1),
  # End of "new" (currently unused) values
  &_struc_setting('wg', $num_trim_re, 1),
  &_struc_setting('ws', $num_trim_re, 1),
  &_struc_setting('endgaps', \&_value_yesno, 1),
  &_struc_setting('substring', \&_value_yesno),
  &_struc_setting('minsites', $int_trim_re),
  &_struc_setting('maxsites', $int_trim_re),
  &_struc_setting('wnsites', $num_trim_re),
  &_struc_setting('spmap', qr/^\s*(pam|uni)\s*$/),
  &_struc_setting('spfuzz', $num_trim_re),
  &_struc_setting('prior', qr/^\s*(addone|dirichlet|dmix|mega|megap)\s*$/),
  &_struc_setting('beta', $num_trim_re),
  &_struc_setting('maxiter', $int_trim_re),
  &_struc_setting('distance', $num_trim_re),
  #&_struc_setting('num_sequences', $int_trim_re),
  &_struc_setting('num_positions', $int_trim_re),
  &_struc_setting('seed', qr/^\s*(\-?\d+)\s*$/),
  &_struc_setting('hsfrac', $num_trim_re),
  &_struc_setting('searchsize', $int_trim_re),
  &_struc_setting('maxsize', $int_trim_re),
  &_struc_setting('norand', qr/^\s*(yes|no)\s*$/),
  &_struc_setting('csites', qr/^\s*(\-?\d+)\s*$/, 1),
  &_struc_setting('strands', qr/^\s*(both|forward|none)\s*$/),
  &_struc_setting('brief', $int_trim_re),
  &_struc_setting('psp_file', $trim_re),
  &_struc_setting('priors_file', $trim_re),
  &_struc_setting('reason_for_stopping', $trim_re),
  #&_struc_setting('back_order', $int_trim_re),
  &_struc_alph_freqs('background_frequencies', [{ATR => 'source', VAL => $text_re},
	{ATR => 'order', VAL => $int_trim_re}])
];
my $ST_IN_TRSET = [ # training set
  &_struc_alphabet(ID2SYM_AS => 'id_to_sym', SYM2ID_AS => 'sym_to_id', ALPH_AS => 'alph'),
  {ELE => 'sequence', RPT => $RPT_ANY, ATRS => [
      {ATR => 'id', VAL => qr/^sequence_(\d+)$/},
      {ATR => 'name', VAL => $text_re},
      {ATR => 'length', VAL => $int_re},
      {ATR => 'weight', VAL => $pos_num_re}]},
  &_struc_alph_freqs('letter_frequencies')
];
my $ST_IN_MEME = [
  {ELE => 'training_set', TO => $ST_IN_TRSET, ATRS => [
      {ATR => 'primary_sequences', VAL => $text_re},
      {ATR => 'primary_count', VAL => $int_re},
      {ATR => 'primary_positions', VAL => $int_re},
      {ATR => 'control_sequences', VAL => $text_re},
      {ATR => 'control_count', VAL => $int_re},
      {ATR => 'control_positions', VAL => $int_re}
    ]
  },
  {ELE => 'model', TO => $ST_IN_MODEL, SILENT => 1, END => \&_call_handle_settings},
  {ELE => 'motifs', TO => [&_struc_motif()]},
  {ELE => 'scanned_sites_summary', RPT => $RPT_OPT, TO => $ST_IN_SSS, ATRS => [
      {ATR => 'p_thresh', VAL => $num_re}]}];
my $ST_START = [
  {ELE => 'MEME', NAME => 'meme', RPT => $RPT_ONE, TO => $ST_IN_MEME, ATRS => [
      {ATR => 'version', VAL => qr/^(\d+)\.(\d+)\.(\d+)$/, EXPAND => 3},
      {ATR => 'release', VAL => $text_re}
    ]}];

#
# Return the definition of a MEME XML file.
#
sub _xml_def {
  return $ST_START;
}

#
# Check value is either anr, oops, tcm or zoops.
# If anr is specified then convert it to tcm before returning.
#
sub _value_modtype {
  my ($self, $value) = @_;
  if ($value =~ qr/^\s*(anr|oops|tcm|zoops)\s*$/) {
    my $type = $1;
    $type = 'tcm' if ($type eq 'anr');
    return $type;
  }
  die("expected anr, oops, tcm or zoops\n");
}

sub _struc_setting {
  my ($element, $check, $opt) = @_;
  my $fn_store = sub {
    my ($self, $value) = @_;
    $self->{settings}->{$element} = $value;
  };
  return (
    {
      ELE => $element, 
      RPT => ($opt ? $RPT_OPT : $RPT_ONE),
      VAL => $check,
      SILENT => 1,
      END => $fn_store
    }
  );
}

sub _struc_contributing_site {
  my ($lflank, $seq, $rflank);
  my $fn_init = sub {
    my ($self, $attrs) = @_;
    $lflank = '';
    $seq = '';
    $rflank = '';
  };
  my $fn_left_flank = sub {
    my ($self, $left_flank) = @_;
    $lflank = $left_flank;
  };
  my $fn_letter_ref = sub {
    my ($self, $letter_id) = @_;
    $seq .= $self->{id_to_sym}->{$letter_id};
  };
  my $fn_right_flank = sub {
    my ($self, $right_flank) = @_;
    $rflank = $right_flank;
  };
  my $fn_contributing_site = sub {
    my ($self, $seq_num, $pos, $strand, $pvalue) = @_;
    return ($seq_num, $pos, $strand eq 'minus', $pvalue, $lflank, $seq, $rflank);
  };
  return (
    {
      ELE => 'contributing_site',
      NAME => 'site',
      HANDLE => $fn_contributing_site,
      WATCH => $fn_init,
      RPT => $RPT_ANY,
      ATRS => [
        {ATR => 'sequence_id', VAL => qr/^sequence_(\d+)$/},
        {ATR => 'position', VAL => $int_re},
        {ATR => 'strand', VAL => qr/^(plus|minus|none)$/},
        {ATR => 'pvalue', VAL => \&_value_pvalue}
      ],
      TO => [
        {ELE => 'left_flank', VAL => qr/^\s*($sym_re*)\s*$/, END => $fn_left_flank},
        {
          ELE => 'site',
          TO => [
            {
              ELE => 'letter_ref', 
              RPT => $RPT_ALO,
              END => $fn_letter_ref,
              ATRS => [
                {ATR => 'letter_id', VAL => $sym_id_re}
              ]
            }
          ]
        },
        {ELE => 'right_flank', VAL => qr/^\s*($sym_re*)\s*$/, END => $fn_right_flank}
      ]
    }
  );
}

sub _struc_alph_array {
  my ($callback, %options) = @_;
  my %values;
  my $fn_init = sub {
    %values = ();
  };
  my $fn_letter_value = sub {
    my ($self, $id, $value) = @_;
    if (defined($values{$id})) {
      $self->_error("Value repeated for letter ID \"$id\".");
    }
    $values{$id} = $value;
  };
  my $fn_result = sub {
    my ($self) = @_;
    my @alph_array = ();
    for (my $i = 0; $i < $self->{alph}->size_core(); $i++) {
      my $sym = $self->{alph}->char($i);
      my $id = $self->{sym_to_id}->{$sym};
      my $value = $values{$id};
      unless (defined($value)) {
        $self->_error("Value missing for letter ID \"$id\".");
        $value = 0;
      }
      $alph_array[$i] = $value;
    }
    $callback->(@alph_array);
  };
  return (
    {
      ELE => 'alphabet_array',
      RPT => $RPT_ALO,
      SILENT => 1,
      START => $fn_init,
      END => $fn_result,
      TO => [
        {
          ELE => 'value',
          RPT => $RPT_ALO,
          SILENT => 1,
          END => $fn_letter_value,
          ATRS => [
            {ATR => 'letter_id', VAL => $sym_id_re}
          ],
          VAL => $num_trim_re
        }
      ]
    }
  );
}
sub _struc_alph_freqs {
  my ($element, $atrs) = @_;
  $atrs = [] unless defined $atrs;
  my @freqs;
  my $fn_array = sub {
    @freqs = @_;
  };
  my $fn_result = sub {
    my ($self, @atrs) = @_;
    return (@atrs, \@freqs);
  };
  return (
    {
      ELE => $element,
      HANDLE => $fn_result,
      TO => [&_struc_alph_array($fn_array)], 
      ATRS => $atrs
    }
  );
}

sub _struc_alph_matrix {
  my ($callback, %options) = @_;
  my @alph_matrix;
  my $fn_init = sub {
    @alph_matrix = ();
  };
  my $fn_alph_array = sub {
    my @alph_array = @_;
    push(@alph_matrix, \@alph_array);
  };
  my $fn_result = sub {
    $callback->(@alph_matrix);
  };
  return (
    {
      ELE => 'alphabet_matrix',
      SILENT => 1,
      START => $fn_init,
      END => $fn_result,
      TO => [
        &_struc_alph_array($fn_alph_array)
      ]
    }
  );
}
sub _struc_motif {
  my ($motif_num, $name, $alt, $width, $sites, $llr, $ic, $re, $bayes_threshold, $evalue, $elapsed_time, $url, @scores, @probs, $regexp);
  my $fn_motif_init = sub {
    my ($self, $atrs);
    ($self, $atrs, $motif_num, $name, $alt, $width, $sites, $llr, $ic, $re, $bayes_threshold, $evalue, $elapsed_time, $url) = @_;
  };
  my $fn_scores = sub {
    @scores = @_;
  };
  my $fn_probs = sub {
    @probs = @_;
  };
  my $fn_regexp = sub {
    my ($self, $re) = @_;
    $regexp = $re;
  };
  my $fn_start_motif = sub {
    my ($self, $attrs) = @_;
    return (IDX => $motif_num - 1, NAME => $name, ALT => $alt, WIDTH => $width, SITES => $sites,
      LLR => $llr, IC => $ic, RE => $re, BAYES => $bayes_threshold,
      EVALUE => $evalue, ELAPSED => $elapsed_time, URL => $url,
      REGEXP => $regexp, PSM => \@scores, PWM => \@probs);
  };
  return (
    {
      ELE => 'motif',
      RPT => $RPT_ANY,
      SILENT => 1,
      START => $fn_motif_init,
      ATRS => [
        {ATR => 'id', VAL => qr/^motif_(\d+)$/},
        {ATR => 'name', VAL => $text_re},
        {ATR => 'alt', VAL => $text_re},
        {ATR => 'width', VAL => $int_re},
        {ATR => 'sites', VAL => $int_re},
        {ATR => 'llr', VAL => $num_re},
        {ATR => 'ic', VAL => $num_re},
        {ATR => 're', VAL => $num_re},
        {ATR => 'bayes_threshold', VAL => $num_re},
        {ATR => 'e_value', VAL => $num_re},
        {ATR => 'elapsed_time', VAL => $num_re},
        {ATR => 'url', OPT => qr/^(\S*)$/}
      ],
      TO => [
        {
          ELE => 'scores',
          SILENT => 1,
          TO => [&_struc_alph_matrix($fn_scores)]
        },
        {
          ELE => 'probabilities',
          SILENT => 1,
          TO => [&_struc_alph_matrix($fn_probs)]
        },
        {
          ELE => 'regular_expression',
          RPT => $RPT_OPT,
          SILENT => 1,
          END => $fn_regexp,
          VAL => qr/^\s*((?:$sym_re|\[$sym_re+\])+)\s*$/
        },
        {
          ELE => 'contributing_sites',
          NAME => 'motif', # by a bit of trickery we emit calls to start_motif, handle_site, end_motif
          START => $fn_start_motif,
          TO => [&_struc_contributing_site]
        }
      ]
    }
  );
}

sub _call_handle_settings {
  my ($self) = @_;
  my %settings = %{$self->{settings}};
  $self->_call('handle_settings', %settings);
}
