# File: PriorUtils.pm
# AUTHOR: Philip Machanick
# CREATE DATE: 28 July 2009 (the original prior_utils.pl)
# Description: common routines for creating priors
#
# FIXME: some hacks to work in this directory hierarchy
# including removing hypergeometric prior
#
package PriorUtils;

use strict;
use warnings;

require Exporter;
our @ISA = qw(Exporter);
our @EXPORT_OK = qw(normalize_size log_background make_background
    normalized_classifier_scores printScoreAsPrior print_scores scale_scores
    class_prior_prob count_all_prob D_prior class_prior_norm hamming_prior
    uncount_ambigs count_all_w_mers count_all_norm count_w_mer
    count_w_mer_triples count_w_wide_triples count_present
    score_w_mer_hypergeometric extend_w_mer score_discrim_w_mer score_w_mer
    check_numeric roundup read_seq_file list_w_mers
    wmer2pspm read_prior_file);

use Scalar::Util qw(looks_like_number);
use Data::Dumper;

use ReadFastaFile qw(read_fasta_sequence);

my ($NAMEPOS, $SEQPOS, $COMMENTPOS) = (0, 1, 2); # order in read_fasta_sequence
my @ALLOWED_MODELS = ('oops', 'zoops', 'tcm');

################################################################################
#
#  normalize_size
#
#  returns as an array of 2 answers sum of normalized sizes of negative and
#  positive sequences respectively
#  Each sequence array is passed as reference to and array of
#  [name, sequencedata, comment]
#  approach: calculate add up mean length / length of each sequence
#
################################################################################

sub normalize_size {
    my ($neg_sequences,    # ref to array of ref to array
        $pos_sequences,    # ref to array of ref to array
        $L_mean            # average over all sequence lengths
    ) = @_;
    
    my ($N_neg_normalized, $N_pos_normalized) = (0,0);
  # recallibrate no. negative sequences to maximum value
  # of adding all normalized scores
  foreach my $seq_data (@$neg_sequences) {
    $N_neg_normalized += $L_mean/length($$seq_data[$SEQPOS]);
  }
  # same for positive sequences
  foreach my $seq_data (@$pos_sequences) {
    $N_pos_normalized += $L_mean/length($$seq_data[$SEQPOS]);
  }
  return ($N_neg_normalized, $N_pos_normalized);
}

################################################################################
#
#  log_background
#
#  convert background model to logs and as a bonus return an array of the
#  minimum and maximum letter probabilies
#
################################################################################

sub log_background {
    my ($backgroundmodel    # reference to hash of letter probabilies, indexed on letter
    ) = @_;
    my ($min_prob, $max_prob);
    foreach my $letter (keys %$backgroundmodel) {
        my $letter_prob = $$backgroundmodel{$letter};
        if (!defined ($min_prob) || $letter_prob < $min_prob) {
            $min_prob = $letter_prob;
        }
        if (!defined ($max_prob) || $letter_prob > $max_prob) {
            $max_prob = $letter_prob;
        }
        $$backgroundmodel{$letter} = log($letter_prob);
    }
    return ($min_prob, $max_prob);
}

################################################################################
#
#  make_background
#
#  returns a cumulative log background model as an array of references to
#  arrays, each of the referenced arrays the log cumulative background for
#  the sequence in the same position in the sequence array.
#  Datatypes:
#  input:   $alphabet: string of allowed letters
#           $sequences: reference to array of references to arrays
#                each of the contained arrays contains
#                [name, sequencedata, comment]
#  returns: reference to array of references to cumulative background
#            models in same order as sequence data, offset by 1 so that
#            position i+1 is sumulative probability for up to letter i
#            in the original sequence
#  Basic method:
#          For each letter in each sequence, look up its background
#          probability as read in by read_background_file and store
#          the log of that probability added to previous logs to give
#          a cumulative total up to that position in the sequence.
#          Allows the background for a w-mer to be computed using
#          one subtraction and one exponential.
#
################################################################################

sub make_background {
    my ($backgroundmodel,    # reference to hash of letter probabilies, indexed on letter
        $sequences           # reference to array of references 
    ) = @_;
    
    my @cumul_back_model = ();

    foreach my $seq_data (@$sequences) {
        $_ = $$seq_data[$SEQPOS];
        my @seqchars = split(//);
        my $last = 0;
        # make thefirst element 0; position j+1 represents cumulative probability up to j
        my @log_cumul_background = ( 0 );
        foreach my $char (@seqchars) {
            die "`$char' not in background" unless defined($$backgroundmodel{$char});
            $last += $$backgroundmodel{$char};
            push (@log_cumul_background, $last);
        }
        push (@cumul_back_model, \@log_cumul_background);
    }
#    print STDERR "cumul background model: ".Dumper(\@cumul_back_model);

  return \@cumul_back_model;
}


################################################################################
#
#  normalized_classifier_scores
#
#  returns a reference to a hash indexed on sequence name containing
#  scores for each name, represented as reference to an array of numbers
#  Each sequence array is passed as reference to and array of
#  [name, sequencedata, comment]
#  approach: calculate add up mean length / length of each sequence
#
#  for triples, we also need the scores for the last $width
#
################################################################################
sub normalized_classifier_scores {
    my ($pos_sequences,     # ref to array each element a sequence (string)
        $pos_dictionary,    # ref to hash indexed on w-mer: count of w-mer in pos set
        $neg_dictionary,    # ref to hash indexed on w-mer: count of w-mer in neg set
        $width,             # current motif width under consideration
        $N_pos,             # size of positive set (# sequences)
        $N_neg,             # size of negative set (# sequences)
        $N_pos_normalized,  # normalized size of positive set
        $N_neg_normalized,  # normalized size of negative set
        $min_score,         # ref to scalar: to return min score from score_w_mer
        $max_score,         # ref to scalar: to return max score from score_w_mer
        $scale,             # if true scale value to standard range
        $absolute,          # if true, don't weight for number of sequences
        $type,              # if true, score discriminative or hypergeometric prior
        $triples,           # if true score spaced triples rather than w-mers
        $fixed_start,       # if true score spaced triples with start position fixed
        $lastscores         # scores for $width-1 used to avoid recomputing triples scores
    ) = @_;
    
    my %prior_values;
    my $i = 0;
    my %score_cache;

    foreach my $seq_data (@$pos_sequences) {
      my @scores;
      # scores are stored in a hash table hashed on sequence name
      # sequence names and data are stored in an array indexed by $NAMEPOS and $SEQPOS
      my $last_i_scores = defined ($lastscores) ? $$lastscores{@$seq_data[$NAMEPOS]} : undef;
      if ($type && $type eq "D") { # FIXME: only D priors work for protein
          score_discrim_w_mer (\@scores, @$seq_data[$SEQPOS], $pos_dictionary, $neg_dictionary, $width,
            $N_pos, $N_neg, $N_pos_normalized, $N_neg_normalized, $min_score, $max_score, $scale,
            $absolute, $triples, $fixed_start, $last_i_scores, \%score_cache);
      } elsif ($type && $type eq "hyper") { # also covers case of D-hyper 
          &score_w_mer_hypergeometric (\@scores, @$seq_data[$SEQPOS], $pos_dictionary,
            $neg_dictionary, $width,
            $N_pos, $N_neg, $N_pos_normalized, $N_neg_normalized, $min_score, $max_score, $scale,
            $absolute);  #FIXME: doesn't do triples
      } else { # FIXME: triples not updated to latest stuff in score_discrim_w_mer
          &score_w_mer (\@scores, @$seq_data[$SEQPOS], $pos_dictionary, $neg_dictionary, $width,
            $N_pos, $N_neg, $N_pos_normalized, $N_neg_normalized, $min_score, $max_score, $scale,
            $absolute, $triples, $fixed_start, $last_i_scores, \%score_cache);
      }
      my $name = $$seq_data[$NAMEPOS];
#       print STDERR "scores (total # =". @scores. @$seq_data[$NAMEPOS].":\n".Dumper(\@scores);
      $prior_values{@$seq_data[$NAMEPOS]} = [@scores];
    }
    return \%prior_values;
}

################################################################################
#
#  printScoreAsPrior
#
#  convert scores to prior values; OOPS, ZOOPS and TCM models
#  Datatypes:
#  input:  $allscores: reference to hash of scores indexed on FASTA name
#          each element a reference to array of numbers
#          $model: should be "zoops", "oops" or "tcm"
#          $width: width of motif for which prior computed
#          $scale_min, $scale_max: values used to rescale scores before
#          converting to priors (from [0..1] to [$scale_min .. $scale_max])
#          $outfile: optional path to output file; defaults to STDOUT
#          $names: reference to array of sequence names, use to order
#          output if not undef
#  output: to stdout unless $outfile defined
#          >name 
#  Basic method (oops, zoops):
#          For each sequence, rescale scores to form a probability distribution.
#          For Zoops, K_0 = 1, OOPS K_0 = 0 representing a score
#          for finding no motif in a given sequence.
#          For j = 1..L-w+1: alpha = sum K_j = score_j/(1-score_j)
#          Each prior value p_j = K_j / alpha
#
#          TCM variation: save rescaling until all scores seen and rescale to
#          form a single distirbution
#
#          FIXME: OOPS, TCM not tested
#  
#
################################################################################
sub printScoreAsPrior {
    my ($allscores,
        $model,
        $width,
        $scale_min,
        $scale_max,
        $outfile,
        $names) = @_;
    my $alpha = 0;    # for TCM, don't reset in counting loop
    my %allK;         # only used for TCM
    my $OK = 0;    # test if model string OK:
    foreach my $m (@ALLOWED_MODELS) {
        if ($m eq $model) {
            $OK = 1;
            last;
        }
    }
    $outfile = "-" unless ($outfile); # STDOUT if not defined
    open(OUTFILE, ">$outfile") or die ("failed to open $outfile\n");
    die "invalid model `$model' in printScoreAsPrior" unless $OK;
    $names = [sort keys %$allscores] if !$names;
    foreach my $name (@$names) {
      if ($model ne 'tcm') {    # keep accumulating over all seqs for TCM
          $alpha = $model eq 'zoops' ? 1 : 0; # represents K_0 in following summation
      }
      my @K = ();
      my $L = @{$$allscores{$name}}; #$nonzeros = @{$$allscores{$name}} - $width + 1;
      for (my $j = 0; $j < $L; $j++) {
        my $P_ij = ${$$allscores{$name}}[$j];
        push(@K,$P_ij/(1-$P_ij));
        $alpha += $K[-1];  # add on last K
      }
      
      if ($model ne 'tcm') {
          &print_prior_seq ($name, $width, $scale_min, $scale_max, \@K, $alpha, *OUTFILE);
      } else  { #tcm
          $allK{$name} = \@K;
      }
    }
    # we now have a basis for scaling all values over all sequences to a
    # single probability distribution
    if ($model eq 'tcm') {
        foreach my $name (@$names) {
            &print_prior_seq ($name, $width, $scale_min, $scale_max, $allK{$name}, $alpha, *OUTFILE);
        }
    }
}

################################################################################
#
#  print_prior_seq
#  Datatypes:
#  input:  $name: FASTA name of original sequence
#          $width: motif width for prior computed
#          $epsilon: adjustment of original range of scores
#          $K: ref to array of numbers -- priors before final
#          scaling to form probability distribution
#          $alpha: to divide $K values to form probabilities
#          $outfile: optional path to output file; defaults to STDOUT
#  output: stdout PSP file format
#    >name $width epsilon = $epsilon
#    number number ... number
#  Basic method: divides $K values to form probabilities
#  Complexity: O(L) where L is sequence length
#
################################################################################

sub print_prior_seq {
    my ($name,    # name from original FASTA file
    $width,       # width or motif
    $scale_min,   # adjustment to original value range
    $scale_max,
    $K,           # reference to array of partially computed priors
    $alpha,       # value to divide K values to convert to probabilities
    $outfile      # file handle if defined
    ) = @_;
    local *OUTFILE;
    if ($outfile) {
        *OUTFILE = $outfile;
    } else {
        *OUTFILE = \*STDOUT;
    }
    printf OUTFILE '>'.$name." $width scaledmin = %G scaledmax = %G\n", $scale_min, $scale_max;
    foreach my $k_val (@$K) {
      printf OUTFILE "%8.7G ", $k_val / $alpha; # was 20.19lG in older versions
    }
    # now print terminating w-1 zeros
    for (my $i = 0; $i < $width - 2; $i++) {
      print OUTFILE "0 ";
    }
    print OUTFILE "0\n";
}

################################################################################
#
#  print_scores
#
#  print scores after rescaling to the given range
#
################################################################################
sub print_scores {
    my ($allscores,     # reference to hash indexed on names, each element its scores
        $scale_min,     # min scaled value
        $scale_max,     # max scaled value
        $min_score,
        $max_score,
        $width,         # width of motif
        $scale          # if true, scaled to standard range
    ) = @_;

    # sort the names here to ensure output is in a consistent order
    foreach my $name (sort keys %$allscores) {
      print ">".$name."\n";
      my $scores = $$allscores{$name};
      foreach my $sc (@$scores) {
        if ($scale && $sc > $scale_max) {
          print STDERR "printing $sc > $scale_max max_score = $max_score; min_score = $min_score\n";
        }
        printf "%.19G ", $sc;
      }
      for (my $i = 0; $i < $width - 1; $i++) {
        print "0";
        print " " unless ($i == $width - 2);
      }
      printf "\n";
    }
    
}

################################################################################
#
#  scale_scores
#
#  scale scores to the standard range
#  in this case $allscores is used to return results as well as pass in
#
################################################################################
sub scale_scores {
    my ($allscores,     # reference to hash indexed on names, each element its scores
        $scale_min,     # min scaled value
        $scale_max,     # max scaled value
        $min_score,     # previously determined minimum score
        $max_score,     # previously determined maximum score
        $verbose        # if true, stats to STDERR
    ) = @_;

    my %scorestats;

    # sort the names here to ensure output is in a consistent order
    foreach my $name (sort keys %$allscores) {
      my $scores = $$allscores{$name};
      for (my $j = 0; $j < @$scores; $j++) {
          my $sc = $$scores[$j];
          die "scaling scores but $max_score <= $min_score" unless ($max_score >= $min_score);
          if ($max_score == $min_score) {
              $sc = $scale_max;
          } else {
              $sc = ($sc - $min_score)/($max_score-$min_score)*($scale_max-$scale_min)+$scale_min;
          }
          die "scaled score $sc out of range [$scale_min..$scale_max]"
              unless ($sc >= $scale_min && $sc <= $scale_max);
        if ($verbose) {
          if (exists $scorestats{$sc}) {
            $scorestats{$sc}++;
            } else {
              $scorestats{$sc} = 1;
            }
        }
        $$scores[$j] = $sc;    # save the scaled score
      }
    }
    
    if ($verbose) {
      print STDERR "\n";
      foreach my $sc (sort keys(%scorestats)) {
        print STDERR "score $sc occurred $scorestats{$sc} times\n";
      }
    }
}


################################################################################
#
#  class_prior_prob
#
#  use classification of each w-mer in terms of the number of positive set
#  sequences in which it occurs plus the number of negative set sequences in
#  which it does not occur as a fraction of the number of sequences.
#  Datatypes:
#  input:  $pos_sequences: reference to array of positive sequences, each 
#          element a string of letters of the allowed alphabet
#          $neg_sequences: same type as $pos_sequences representing negative
#          sequences
#          $revcomp: true if reverse complement required
#          $width: width of motif under consideration
#          normalizing not required
#  output: $pos_dictionary: reference to hash indexed on w-mer, with entries
#          each also a hash containing values indexed on "count" (count
#          of that w-mer) and "last" (index of previous place w-mer seen)
#          $neg_dictionary: same for negative sequences
#  Basic method: Normalizes count as average seq length / length of seq in which
#  w-mer found (unless mean length not defined, in which case presence count
#  is 1).
#  Complexity: O(LN) where L is sequence length and N # sequences.
#  Detail at count_all_norm.
#
################################################################################
sub class_prior_prob {
  my ($pos_sequences,   # reference to array of positive sequence data
      $neg_sequences,   # reference to array of negative sequence data
      $alph,            # alphabet
      $revcomp,         # if true, also count reverse complement of w-mer
      $width,           # motif width
      $pos_dictionary,  # ref to hash count positive seqs where each w-mer seen
      $neg_dictionary,  # ref to hash count negative seqs where each w-mer seen
      $pos_log_cuml_bg, # ref to log cumulative background probabilities for pos seqs
      $neg_log_cuml_bg, # ref to log cumulative background probabilities for pos seqs
      $min_bg_prob      # lowest background probability of any letter
  ) = @_;
    # count the number of sequences in which a w-mer occurs first in the positive
    # then in the negative sequences; the count is weighted for the probabilty
    # of not finding the w-mer based on the background model
    return (&count_all_prob($pos_sequences, $alph, $revcomp, $width, $pos_dictionary,
        $pos_log_cuml_bg, $min_bg_prob),
        &count_all_prob($neg_sequences, $alph, $revcomp, $width, $neg_dictionary,
        $neg_log_cuml_bg, $min_bg_prob));
}


################################################################################
#
#  count_all_prob FIXME DOCUMENTATION NOT CORRECT
#  FIXME: doesn't do later features like spaced triples
#
#  for each w_mer found in a given sequence, pass in to count_w_mer
#  returns sum of all available scores / number of bases visited
#  
#  Datatypes:
#  input:  $sequences: reference to array of sequences, each a string
#          of letters of the allowed alphabet
#          $revcomp: true if reverse complement required
#  output: $dictionary: reference to hash indexed on w-mer, with entries
#          each also a hash containing values indexed on "count" (count
#          of that w-mer) and "last" (index of previous place w-mer seen)
#
#  Basic method: for each sequence, visit each w-mer once and if it
#  has not been encountered before in the current sequence, increment
#  its count (initialize if never seen before), and store the position
#  of the sequence as "last" to remember where it was last seen.
#  Count is 1 in absolute mode or (1 - p_a)^(L-w+1) for background
#  probabiliy of letter a p_a, sequence length L, motif width w) in
#  probabilistic normalization mode.
#  Complexity: O(LN) where L is sequence length and N # sequences.
#  There is variation for w because there is some caching of precomputed
#  matches so a larger w with more unique w-mers will result in slower
#  execution.
#
################################################################################

sub count_all_prob {
  my ($sequences,     # reference to array of sequence data
      $alph,          # alphabet
      $revcomp,       # if true, also count reverse complement of w-mer
      $width,         # motif width
      $dictionary,    # count of sequences where each w-mer seen
      $log_cuml_bg,   # log cumulative bg for all sequences in this set
      $min_bg_prob    # lowest background probability for any letter
  ) = @_;

  # $i used to ensure we only count a w-mer once per sequence plus index background
  # seq_bg[$j+$w-1] - @seq_bg[$j-1]
  my $i = 0;
  my $maxscore = 0;
  my $min_prob_for_w = 1 - ($min_bg_prob ** $width);
#  print STDERR "in count_all_prob".defined($version)?$version." ":""."\n";
  foreach my $seqdata (@$sequences) {
    my $seq = $$seqdata[$SEQPOS];
    my $bg = @$log_cuml_bg[$i];
    my $L_w_1 = length($seq) - $width + 1; # L-w+1
    for (my $j = 0; $j < $L_w_1; $j++) {
      # subtract cumulative log prob before start of this w-mer then
      # use exp to get probability of this w-mer; cumulative probability
      # vector has a 0 at the front so we don't have to special-case dealing
      # with item 0, i.e. all indexes offset by +1. Cumulative probability for
      # position j+w-1 in sequence is at j+w in cumulative probability
      my $prob = exp ($$bg[$j+$width] - $$bg[$j]);
      my $count = (1-$prob) ** $L_w_1;
      # print STDERR "p = $prob, '(1-$prob)' = (1-$prob), L-w+1 = $L_w_1, s = $count;";
      my $w_mer = substr($seq, $j, $width);
      &count_w_mer ($w_mer, $alph, $revcomp, $i, $count, $dictionary);
    }
    # probability for 
    $maxscore += $min_prob_for_w ** $L_w_1;
    $i++;
  }
  return $maxscore;
}


################################################################################
#
#  D_prior
#
#  use discrimination of each w-mer in times it appears in all positive set
#  sequences versus the number of times it occurs in each negative set sequences,
#  as a fraction of the number of sequences.
#  Datatypes:
#  input:  $pos_sequences: reference to array of positive sequences, each 
#          element a string of letters of the allowed alphabet
#          $neg_sequences: same type as $pos_sequences representing negative
#          sequences
#          $revcomp: true if reverse complement required
#          $meanL: number, mean over all sequence lengths (pos+neg); undef if
#          normalizing not required
#  output: $pos_dictionary: reference to hash indexed on w-mer, with entries
#          each also a hash containing values indexed on "count" (count
#          of that w-mer) and "last" (index of previous place w-mer seen)
#          $neg_dictionary
#  Basic method: counts number of occurrence of each word in positive and negative
#  sets, and calculates a score for each word that is the fraction of all those
#  occurrences that are in the positive set. Here, we do the count; the fraction is
#  calculated in normalized_classifier_scores (so named because other methods may
#  do normalizing).
#  
#  Complexity: O(LN) where L is sequence length and N # sequences.
#  Data structures for counting described at count_all_w_mers.
#
################################################################################
sub D_prior {
  my ($pos_sequences,   # reference to array of positive sequence data
      $neg_sequences,   # reference to array of negative sequence data
      $alph,            # alphabet
      $revcomp,         # if true, also count reverse complement of w-mer
      $width,           # motif width
      $pos_dictionary,  # ref to hash count positive seqs where each w-mer seen
      $neg_dictionary,  # ref to hash count negative seqs where each w-mer seen
      $triples          # if true score spaced triples rather than w-mers
  ) = @_;

    # count the number of times each w-mer occurs first in the positive
    # then in the negative sequences
    &count_all_w_mers($pos_sequences, $alph, $revcomp, $width, $pos_dictionary, $triples);
    #print STDERR "pos counts including ambigs: ".Dumper(\$pos_dictionary);
    &uncount_ambigs  ($pos_dictionary, $alph, $revcomp, $triples);
    #print STDERR "pos counts excluding ambigs: ".Dumper(\$pos_dictionary);
    &count_all_w_mers($neg_sequences, $alph, $revcomp, $width, $neg_dictionary, $triples);
    #print STDERR "neg counts including ambigs: ".Dumper(\$neg_dictionary);
    &uncount_ambigs  ($neg_dictionary, $alph, $revcomp, $triples);
    #print STDERR "neg counts excluding ambigs: ".Dumper(\$neg_dictionary);
} # D_prior


################################################################################
#
#  class_prior_norm
#
#  use classification of each w-mer in terms of the number of positive set
#  sequences in which it occurs plus the number of negative set sequences in
#  which it does not occur as a fraction of the number of sequences.
#  Datatypes:
#  input:  $pos_sequences: reference to array of positive sequences, each 
#          element a string of letters of the allowed alphabet
#          $neg_sequences: same type as $pos_sequences representing negative
#          sequences
#          $revcomp: true if reverse complement required
#          $meanL: number, mean over all sequence lengths (pos+neg); undef if
#          normalizing not required
#  output: $pos_dictionary: reference to hash indexed on w-mer, with entries
#          each also a hash containing values indexed on "count" (count
#          of that w-mer) and "last" (index of previous place w-mer seen)
#          $neg_dictionary
#  Basic method: Normalizes count as average seq length / length of seq in which
#  w-mer found (unless mean length not defined, in which case presence count
#  is 1).
#  Complexity: O(LN) where L is sequence length and N # sequences.
#  Detail at count_all_norm.
#
################################################################################
sub class_prior_norm {
  my ($pos_sequences,   # reference to array of positive sequence data
      $neg_sequences,   # reference to array of negative sequence data
      $alph,            # alphabet
      $revcomp,         # if true, also count reverse complement of w-mer
      $width,           # motif width
      $meanL,           # previously computed average sequence length
      $pos_dictionary,  # ref to hash count positive seqs where each w-mer seen
      $neg_dictionary,  # ref to hash count negative seqs where each w-mer seen
      $triples          # if true score spaced triples rather than w-mers
  ) = @_;

    # count the number of sequences in which a w-mer occurs first in the positive
    # then in the negative sequences; the count is weighted for the length of the
    # sequence in which the w-mer is found
    &count_all_norm($pos_sequences, $alph, $revcomp, $width, $meanL, $pos_dictionary, $triples);
    &count_all_norm($neg_sequences, $alph, $revcomp, $width, $meanL, $neg_dictionary, $triples);
}

sub hamming_prior {
  my ($pos_sequences,   # reference to array of positive sequence data
      $neg_sequences,   # reference to array of negative sequence data
      $revcomp,         # if true, also count reverse complement of w-mer
      $width,           # motif width
      $meanL,           # previously computed average sequence length
      $pos_dictionary,  # ref to hash count positive seqs where each w-mer seen
      $neg_dictionary,  # ref to hash count negative seqs where each w-mer seen
      $triples,         # if true score spaced triples rather than w-mers
      $min_score,       # reference: return value through this
      $max_score        # reference: return value through this
  ) = @_;

    my %prior_values;
    # score each word for how similar it is to other words in the positive and
    # negative sets. An exact match scores 1; any other match scores matches/w
    for (my $i = 0; $i <  @$pos_sequences; $i++) { # for all positive sequences
      my @scores = ();
      my $seq_data = $$pos_sequences[$i];
      $_ = $$seq_data[$SEQPOS];
      my @seq = split(//);
      my $L_wP1 = @seq - $width + 1;
      # for all words width w in a given sequence
      for (my $j = 0; $j < $L_wP1; $j++) {
          my $j_end = $j + $width - 1;
          my ($pos_count, $neg_count) = (0, 0);
          # find the comparison word and score
          for (my $p = 0; $p <  @$pos_sequences; $p++) {    # for all positive sequences again
              my $comparison;
              if ($p == $i) {    # don't split the array again, reuse
                  $comparison = \@seq;
              } else {
                  my $comparisonseqdata = $$pos_sequences[$p];
                  $_ = $$comparisonseqdata[$SEQPOS];
                  $comparison = [split(//)];
              }
              # compare original + comparison (OK to count self as 1 == all match)
              my $LComp_wP1 = @$comparison - $width + 1;
              my $stop = $LComp_wP1<$L_wP1?$LComp_wP1:$L_wP1;
              # find all words to compare with given word
              my $max_matches = 0;
              for (my $k = 0; $k < $stop; $k++) { # stop based on shorter of 2 sequences
                my $matches = 0;
                my ($k_pos, $k_end) = ($k, $k + $width - 1);
                for (my $pos = $j; $pos < $j_end; $pos++) { 
                  $matches ++ if ($seq[$j] eq $$comparison[$k_pos]);
                  $k_pos++;
                }
                # record the best match count for this word
                $max_matches = $matches if $matches > $max_matches;
              }
              $pos_count += $max_matches/$width;
          }

          for (my $n = 0; $n <  @$neg_sequences; $n++) {    # for all negative sequences now
              my $comparisonseqdata = $$neg_sequences[$n];
              $_ = $$comparisonseqdata[$SEQPOS];
              my $comparison = [split(//)];
              # compare original + comparison (OK to count self as 1 == all match)
              my $matches = 0;
              my $LComp_wP1 = @$comparison - $width + 1;
              my $stop = $LComp_wP1<$L_wP1?$LComp_wP1:$L_wP1;
              # find all words to compare with given word
              my $max_matches = 0;
              for (my $k = 0; $k < $stop; $k++) { # stop based on shorter of 2 sequences
                my $matches = 0;
                my ($k_pos, $k_end) = ($k, $k + $width - 1);
                for (my $pos = $j; $pos < $j_end; $pos++) { 
                  $matches ++ if ($seq[$j] eq $$comparison[$k_pos]);
                  $k_pos++;
                }
                # record the best match count for this word
                $max_matches = $matches if $matches > $max_matches;
              }
              $neg_count += 1 - ($max_matches/$width);
          }
          my $score = ($pos_count + $neg_count) / (@$pos_sequences + @$neg_sequences);
          push (@scores, $score);
          $$min_score = $score if (!$$min_score || $score < $$min_score);
          $$max_score = $score if (!$$max_score || $score > $$max_score);
      }
          $prior_values{@$seq_data[$NAMEPOS]} = [@scores];
    }
    return \%prior_values;
}


################################################################################
# uncount_ambigs
#
# find each word that contains letters in the ambigous set and set its count
# to 1, so it gets the lowest score possible for any word encountered (not strictly
# true as we should give it the highest possible count in the negative set for that
# to be literally true).
# input:  $ambigs: letters in the ambiguous set (string)
#          $revcomp: true if reverse complement required
# output: $dictionary: reference to hash indexed on w-mer, with entries
#          each also a hash containing values indexed on "count" (count
#          of that w-mer) and "last" (index of previous place w-mer seen)
#
################################################################################

sub uncount_ambigs {
  my ($dictionary,    # count of sequences where each w-mer seen
      $alph,          # alphabet
      $revcomp,       # if true fix both strands
      $triples	      # if true words are format: "i,j:x.y.z"
  ) = @_;
  foreach my $word (keys %$dictionary) {
    my $key = $word;
    my ($i, $j);
    if ($triples) { 
      $word =~ m/(\d+),(\d+):(.)\.(.)\.(.)/;
      $word = "$3$4$5";
      $i = $1;
      $j = $2;
    }
    if ($alph->is_part_ambig_seq($word)) {
      $dictionary->{$key}->{"count"} = 1;
      if ($revcomp) {
        my $reverse = $alph->rc_seq($word);
        if (exists ($dictionary->{$reverse})) {
          my $rev_key = $triples ? "$j,$i:$reverse" : $reverse;
          $dictionary->{$rev_key}->{"count"} = 1;
        }
      }
    }
  }
}


################################################################################
#
#  count_all_w_mers
#
#  for each w_mer found in a given sequence, pass in to count_w_mer
#  Datatypes:
#  input:  $sequences: reference to array of sequences, each a string
#          of letters of the allowed alphabet
#          $revcomp: true if reverse complement required
#          $meanL: number, mean over all sequence lengths (pos+neg)
#  output: $dictionary: reference to hash indexed on w-mer, with entries
#          each also a hash containing values indexed on "count" (count
#          of that w-mer) and "last" (index of previous place w-mer seen)
#  Basic method: for each sequence, visit each w-mer once and increment
#  its count (initialize if never seen before) in a hash table.
#  Complexity: O(LN) where L is sequence length and N # sequences.
#  There is variation for w because there is some caching of precomputed
#  matches so a larger w with more unique w-mers will result in slower
#  execution.
#
################################################################################

sub count_all_w_mers {
  my ($sequences,     # reference to array of sequence data
      $alph,          # alphabet
      $revcomp,       # if true, also count reverse complement of w-mer
      $width,         # motif width
      $dictionary,    # count of sequences where each w-mer seen
      $triples        # if true score spaced triples rather than w-mers
  ) = @_;

  my $i = 0; # used to ensure we only count a w-mer once per sequence
  foreach my $seqdata (@$sequences) {
    my $seq = $$seqdata[$SEQPOS];
    my $L_wP1 = length($seq) - $width + 1;
    my $count = 1; # always increment by 1
    for (my $j = 0; $j < $L_wP1; $j++) {
      my $w_mer = substr($seq, $j, $width);
      if ($triples) {
        &count_w_wide_triples ($w_mer, $i, $count, $dictionary);
      } else { # count_w_mer_triples for don't cares in any position
        &count_w_mer ($w_mer, $alph, $revcomp, $i, $count, $dictionary);
      }
      $i++;    # finagle the "last seen" index so we count every instance
    }
  }
}


################################################################################
#
#  count_all_norm
#
#  for each w_mer found in a given sequence, pass in to count_w_mer
#  Datatypes:
#  input:  $sequences: reference to array of sequences, each a string
#          of letters of the allowed alphabet
#          $revcomp: true if reverse complement required
#          $meanL: number, mean over all sequence lengths (pos+neg)
#  output: $dictionary: reference to hash indexed on w-mer, with entries
#          each also a hash containing values indexed on "count" (count
#          of that w-mer) and "last" (index of previous place w-mer seen)
#  Basic method: for each sequence, visit each w-mer once and if it
#  has not been encountered before in the current sequence, increment
#  its count (initialize if never seen before), and store the position
#  of the sequence as "last" to remember where it was last seen.
#  Count is 1 in absolute mode or (mean sequence length) / (this sequence
#  length) in normalized mode.
#  Complexity: O(LN) where L is sequence length and N # sequences.
#  There is variation for w because there is some caching of precomputed
#  matches so a larger w with more unique w-mers will result in slower
#  execution.
#
################################################################################

sub count_all_norm {
  my ($sequences,     # reference to array of sequence data
      $alph,
      $revcomp,       # if true, also count reverse complement of w-mer
      $width,         # motif width
      $meanL,         # previously computed average sequence length
      $dictionary,    # count of sequences where each w-mer seen
      $triples        # if true count spaced triples rather than w-mers
  ) = @_;

  my $i = 0; # used to ensure we only count a w-mer once per sequence
  foreach my $seqdata (@$sequences) {
    my $seq = $$seqdata[$SEQPOS];
    my $count = defined($meanL) ? $meanL/length($seq) : 1;
    for (my $j = 0; $j < length($seq) - $width + 1; $j++) {
      my $w_mer = substr($seq, $j, $width);
      if ($triples) {
          &count_w_wide_triples ($w_mer, $i, $count, $dictionary);
      } else { # count_w_mer_triples for don't cares in any position
          &count_w_mer ($w_mer, $alph, $revcomp, $i, $count, $dictionary);
      }
    }
    $i++;
  }
}


################################################################################
#
#  count_w_mer
#
#  for each w_mer found in a given sequence, increment dictionary count
#  (or initialize if not already present) unless already seen in the same
#  sequence: relies on processing the sequences in order. 
#  Datatypes:
#  input:  $w_mer: string containing w-mer to be counted
#          $revcomp: true if reverse complement required
#          $i: index of current sequence to avoid double-counting
#          $count: amount to increment score
#  output: $dictionary: reference to hash indexed on w-mer, with entries
#          each also a hash containing values indexed on "count" (count
#          of that w-mer) and "last" (index of previous place w-mer seen)
#  Basic method: Value of each count is 1 if abs mode otherwise normalized
#  by seq length.
#  Whichever of the w-mer or reverse complement of a w-mer is found first
#  will set the dictionary entry; from there on, only that entry will be
#  the same (if $revcomp) because it is stored as a reference in
#  bother entries each time (again if $revcomp).
#
################################################################################

sub count_w_mer {
  my (
      $w_mer,        # the w_mer we are currently counting
      $alph,
      $revcomp,      # if true, also count if revcomp($w_mer present)
      $i,            # index unique to a sequence to enforce single counts
      $count,        # 1 or scaled score depending on score method
      $dictionary    # indexed on w-mer, score and last i per entry
  ) = @_;
  my $entry;
  my $reverse;
  if ($revcomp) {
    $reverse = $alph->rc_seq($w_mer);
  }
  # score 1 or scaled score for each time present in $seq
  if (exists ($dictionary->{$w_mer})) {
    $entry = $dictionary->{$w_mer};
  } elsif ($revcomp && exists ($dictionary->{$reverse})) {
    $entry = $dictionary->{$reverse};
  }
  if (defined($entry)) {
    &count_present ($entry, $i, $count);
  } else {
    $entry = {count => $count, last => $i};
  }
  # here we don't care if we found it as forward or revcomp;
  # store a reference to the entry, so it's updated in either
  # case; we want it stored at the location where the actual
  # w-mer was found so it will be found again when using the
  # scores. When we hit the first revcomp instance it will be
  # copied there too.
  $dictionary->{$w_mer} = $entry;
  if ($revcomp) {
    $dictionary->{$reverse} = $entry;
  }
}


################################################################################
#
#  count_w_mer_triples
#
#  for each w-mer found in a given sequence, increment dictionary count
#  (or initialize if not already present) unless already seen in the same
#  sequence: relies on processing the sequences in order. Intended for protein,
#  this variation counts all triples within a w-mer, ignoring letters between
#  the triples, e.g., for VDFMVQSPNEKP, with w=4, would count triples
#  VDF. VD.M V.FM .DFM in the first w-mer where "." == don't care.
#  Datatypes:
#  input:  $w_mer: string containing w-mer to be counted
#          $i: index of current sequence to avoid double-counting
#          $count: amount to increment score
#  output: $dictionary: reference to hash indexed on w-mer, with entries
#          each also a hash containing values indexed on "count" (count
#          of that w-mer) and "last" (index of previous place w-mer seen)
#  Basic method: Value of each count is 1 if abs mode otherwise normalized
#  by seq length.
#  Whichever of the w-mer or reverse complement of a w-mer is found first
#  will set the dictionary entry; from there on, only that entry will be
#  the same (if $revcomp) because it is stored as a reference in
#  bother entries each time (again if $revcomp).
#
################################################################################

sub count_w_mer_triples {
  my (
      $w_mer,        # the w_mer we are currently counting
      $last,         # index unique to a sequence to enforce single counts
      $count,        # 1 or scaled score depending on score method
      $dictionary    # indexed on w-mer, score and last i per entry
  ) = @_;
  my @letters = split(//,$w_mer);
  my $width = @letters;
  my ($w_1, $w_2) = ($width-1, $width-2);
  
  for (my $i = 0; $i < $w_2; $i++) {
    for (my $j = $i+1; $j < $w_1; $j++) {
        for (my $k = $j+1; $k < $width; $k++) {
            my $entry;
            my $found = 0;
            # separate i,j,k with "," to avoid need to worry about digits/number
            my $key = $i.",".$j.",".$k.":$letters[$i].$letters[$j].$letters[$k]";
            if (exists($dictionary->{$key})) {
                $entry = $dictionary->{$key};
                &count_present ($entry, $last, $count);
                $found = 1;          
            }
            if (!$found) {
              my %new_entry;
              $new_entry{"count"} = $count;
              $new_entry{"last"} = $last;
              $entry = \%new_entry;
            }
            $dictionary->{$key} = $entry;
        }
    }
  }

}

################################################################################
#
#  count_w_wide_triples
#
#  for each w-wide triple in a given sequence, increment dictionary count
#  (or initialize if not already present) unless already seen in the same
#  sequence: relies on processing the sequences in order. Intended for protein,
#  counts all triples within a w-mer starting and ending respectively at the
#  start and end point of the w-mer, ignoring letters between.
#  Example: for VDFMVQSPNEKP, with w=4, would count triples
#  VD.M and V.FM in the first w-mer where "." == don't care.
#  Datatypes:
#  input:  $w_mer: string containing w-mer to be counted
#          $i: index of current sequence to avoid double-counting
#          $count: amount to increment score
#  output: $dictionary: reference to hash indexed on w-mer, with entries
#          each also a hash containing values indexed on "count" (count
#          of that w-mer) and "last" (index of previous place w-mer seen)
#  Basic method: Value of each count is 1 if abs mode otherwise normalized
#  by seq length.
#  Whichever of the w-mer or reverse complement of a w-mer is found first
#  will set the dictionary entry; from there on, only that entry will be
#  the same (if $revcomp) because it is stored as a reference in
#  bother entries each time (again if $revcomp).
#
################################################################################

sub count_w_wide_triples {
  my (
      $w_mer,        # the w_mer we are currently counting
      $last,         # index unique to a sequence to enforce single counts
      $count,        # 1 or scaled score depending on score method
      $dictionary    # indexed on w-mer, score and last i per entry
  ) = @_;
  
  my @letters = split(//,$w_mer);
  my $w_1 = @letters-1;
  my ($start, $end) = ($letters[0], $letters[-1]);
  for (my $j = 1; $j < $w_1; $j++) {
    my $entry;
    my $key = "$j,$w_1:$start.$letters[$j].$end";
    if (exists($dictionary->{$key})) {
      $entry = $dictionary->{$key};
      &count_present ($entry, $last, $count);
    } else {
      $entry = {count => $count, last => $last};
    }
    $dictionary->{$key} = $entry;
  }
}

#&count_w_mer_triples ($w_mer, $last, $count, $dictionary);

################################################################################
#
#  count_present
#
#  Given an entry in the dictionary, count it if not seen previously in
#  this sequence. If you want to count all occurrences, imcrement $last
#  for each call.
#
################################################################################
sub count_present {
  my (
      $entry,    # dictionary entry to update, reference to hash
      $last,     # current sequence number
      $count     # amount to increment the score
  ) = @_;
  if ($$entry{"last"} != $last) { # only count once per seq
    $$entry{"last"} = $last;
    $$entry{"count"} += $count;
  }
}


################################################################################
#
#    score_w_mer_hypergeometric  FIXME: omitted from this version
#
#    for each position for the w_mer starting there, look its score up in the
#    positive and negative dictionaries and calculate the overall score  as
#    (log(p)) where
#    p = hypergeometric (score_pos, score_pos+score_neg, N_pos, N_pos+N_neg)
#    This classifies each w-mer as to how strongly it discriminates the
#    positive sequences.
#    FIXME: slow so not implemented for later features like spaced triples
#
################################################################################

sub score_w_mer_hypergeometric {
#  print STDERR "\n";
}

################################################################################
#
#    extend_w_mer
#
#    given scored triples, record pairs representing positions 1,2 and 1,3 with
#    the missing letter -- EXPERIMENTAL, NOT USED
#
################################################################################

sub extend_w_mer {
  my ($seq,              # string containing sequence
      $dictionary,       # for each w_mer in given set, how many sequences it's in
                         # key is w-mer, entries each a hash: "count" relevant here
      $width,            # width of w_mer under consideration
      $N_pos,            # number of positive sequences
      $N_neg,            # number of negative sequences
      $N_pos_normalized, # number of positive sequences normalized for maximum total count
      $N_neg_normalized, # number of negative sequences normalized for maximum total count
      $min_score,        # reference to scalar: highest score calculated here
      $max_score,        # reference to scalar: lowest score calculated here
      $scale,            # if true, calculate $min_score, $max_score
      $absolute,         # if true, don't weight for number of sequences
      $triples,          # if true score spaced triples rather than w-mers
      $fixed_start       # if true score spaced triples with start position fixed
    ) = @_;

    my %part_dictionary;
    foreach (keys %$dictionary) {
        # split the key into midps,endpos and A.B.C
        my @keyparts = split(/:/);
        $_ = $keyparts[1];
        my @letters = split(/\./);
        $_ = $keyparts[0];
        my @positions = split(/,/);
#print STDERR "key parts `@keyparts', letters = `@letters', positions = `@positions'\n";
#exit(1);
        my $key = "1,2#".$positions[0].":$letters[0].$letters[1]";
        $part_dictionary{$key} = exists($part_dictionary{$key}) ? $part_dictionary{$key}+1 : 1;
        $key = "1,3#".$positions[1].":$letters[0].$letters[2]";
        $part_dictionary{$key} = exists($part_dictionary{$key}) ? $part_dictionary{$key}+1 : 1;
    }
#print STDERR "extended dictionary: ".Dumper(\%part_dictionary);
}

################################################################################
#
#    score_discrim_w_mer
#
#    for D-prior score
#
#    for each position for the w_mer starting there, look its count up in the
#    positive and negative dictionaries and calculate the overall score as
#    (pos count + [1]) / (pos count + [1] + neg count + [1 + m/n]) where values
#    in [] are pseudocounts to avoid false peaks for rarely appearing w-mers
#    where m = no. negative sequences, n = no. positive sequences
#
#    for triples, use $last_i_scores, the scores for $width-1, to cut repetition
#    stored in a reference to an array
#
################################################################################

sub score_discrim_w_mer {
  my ($scores,           # reference to array to contain scores for this sequence
      $seq,              # string containing sequence
      $pos_dictionary,   # for each w_mer in positive set, how often it occurs in total
                         # key is w-mer, entries each a hash: "count" relevant here
      $neg_dictionary,   # for negative set, same data as positive set
      $width,            # width of w_mer under consideration
      $N_pos,            # number of positive sequences
      $N_neg,            # number of negative sequences
      $N_pos_normalized, # number of positive sequences normalized for maximum total count
      $N_neg_normalized, # number of negative sequences normalized for maximum total count
      $min_score,        # reference to scalar: highest score calculated here
      $max_score,        # reference to scalar: lowest score calculated here
      $scale,            # if true, calculate $min_score, $max_score
      $absolute,         # if true, don't weight for number of sequences
      $triples,          # if true score spaced triples rather than w-mers
      $fixed_start,      # if true score spaced triples with start position fixed
      $last_i_scores,    # scores for $width-1 for this sequence
      $score_cache       # previous scores for triples of this width (ref to hash)
    ) = @_;

  my $L_wP1 = length($seq) - $width + 1; # L-w+1
  my ($pseudocount_enum, $pseudocount_denom) = (1, 1 + $N_neg/$N_pos);
  for (my $j = 0; $j < $L_wP1 ; $j++) {
    my $w_mer = substr($seq, $j, $width);
    my @letters = split(//,$w_mer);
    my $w_1 = @letters-1;
    my $score;
    if ($triples) {
        my $w_mer_max_score = 0;
        if ($last_i_scores) {
            # if we have scores for $w-1 initialize max as max from
            # positions $j and $j+1; $j+1 not off end of data because
            # of loop endpoint
            $w_mer_max_score = @$last_i_scores[$j];
            # if fixedstart mode, don't consider tuples starting in previous
            # scores' j+1 position
            if (!$fixed_start && @$last_i_scores[$j+1] > $w_mer_max_score) {
                $w_mer_max_score = @$last_i_scores[$j+1];
            }
        }
        # base case: for $width = 3, $j loop will go only once
        # and $w_mer_max_score will not be defined at start
        my $i = 0;
        my $k = $width-1;
            for (my $j = $i+1; $j < $width-1; $j++) {
                    my $key = ($j-$i).",".($k-$i).":$letters[$i].$letters[$j].$letters[$k]";
                    if (exists($pos_dictionary->{$key}) && exists($$score_cache{$key})) {
                        $score = $$score_cache{$key};
                    } else {
                        my $pos_total =
                            exists($pos_dictionary->{$key})?$pos_dictionary->{$key}{"count"}:0;
                        my $neg_total =
                            exists($neg_dictionary->{$key})?$neg_dictionary->{$key}{"count"}:0;
                        $score =  ($pos_total + $pseudocount_enum) / ($pos_total + $neg_total +
                            $pseudocount_denom);
                        $$score_cache{$key} = $score;
print STDERR "score $score > 1 : pos = $pos_total, neg =  $neg_total, pce = $pseudocount_enum, pcd = $pseudocount_denom \n" if ($score > 1);
                    }
                    if (! $w_mer_max_score || $score > $w_mer_max_score) {
                        $w_mer_max_score = $score;
print STDERR "score $score > 1 : pce = $pseudocount_enum, pcd = $pseudocount_denom \n" if ($score > 1);
                    }
                }
        $score = $w_mer_max_score;
    } else {
        my $pos_total = (exists($pos_dictionary->{$w_mer})?$pos_dictionary->{$w_mer}{"count"}:0);
        my $neg_total = (exists($neg_dictionary->{$w_mer})?$neg_dictionary->{$w_mer}{"count"}:0);
        # add pseudocounts to the pos_total / ($pos_total + $neg_total) score here
        $score =  ($pos_total + $pseudocount_enum) / ($pos_total + $neg_total + $pseudocount_denom);
    }
    if (1) { #($scale)
      if ((! defined($$min_score)) || $score < $$min_score) {
          $$min_score = $score;
      }
      if (! defined($$max_score) || $score > $$max_score) {
          $$max_score = $score;
      }
    }
    push(@$scores, $score);
  }
}

################################################################################
#
#    score_w_mer
#
#    for each position for the w_mer starting there, look its score up in the
#    positive and negative dictionaries and calculate the overall score  as
#    (pos count + ( #neg seqs - neg count)) / (# pos seqs + # neg seqs)
#    normalized for how long each sequence in which the w_mer was found and how
#    many sequences there are in the positive and negative sets.
#    This classifies each w-mer as to how strongly it discriminates the
#    positive sequences.
#
################################################################################

sub score_w_mer {
  my ($scores,           # reference to array to contain scores for this sequence
      $seq,              # string containing sequence
      $pos_dictionary,   # for each w_mer in positive set, how many sequences it's in
                         # key is w-mer, entries each a hash: "count" relevant here
      $neg_dictionary,   # for negative set, same data as positive set
      $width,            # width of w_mer under consideration
      $N_pos,            # number of positive sequences
      $N_neg,            # number of negative sequences
      $N_pos_normalized, # number of positive sequences normalized for maximum total count
      $N_neg_normalized, # number of negative sequences normalized for maximum total count
      $min_score,        # reference to scalar: highest score calculated here
      $max_score,        # reference to scalar: lowest score calculated here
      $scale,            # if true, calculate $min_score, $max_score
      $absolute,         # if true, don't weight for number of sequences
      $triples,          # if true score spaced triples rather than w-mers
      $fixed_start,      # if true score spaced triples with start position fixed
      $last_i_scores,    # FIXME: not used here: needed to speed up triples
      $score_cache       # FIXME: check if all OK
    ) = @_;

  my $Neg_weight = $N_pos/$N_neg;

  if ($absolute) {
    $Neg_weight = 1;
  }
  my $L = length($seq);
  for (my $j = 0; $j < $L - $width + 1; $j++) {
    my ($neg_total, $pos_total) = (0,0);
    my $w_mer = substr($seq, $j, $width);
    my @letters = split(//,$w_mer);
    my $w_1 = @letters-1;
    my $w_mer_max_score;
    my $score;

#FIXME: triples do not include all the updates and performance enhancements of
#score_discrim_w_mer##########################################################

    if ($triples) {
        my $last = $fixed_start?1:$width-2;
        for (my $i = 0; $i < $last; $i++) {
            for (my $j = $i+1; $j < $width-1; $j++) {
                for (my $k = $j+1; $k < $width; $k++) {
                    my $key = ($j-$i).",".($k-$i).":$letters[$i].$letters[$j].$letters[$k]";
                    # match for spaced triples that can start with don't cares
#                    my $key = $i.",".$j.",".$k.":$letters[$i].$letters[$j].$letters[$k]";
                    if (exists($pos_dictionary->{$key}) && exists($$score_cache{$key})) {
                        $score = $$score_cache{$key};
                    } else {
                        $pos_total = exists($pos_dictionary->{$key})?$pos_dictionary->{$key}{"count"}:0;
                        $neg_total = exists($neg_dictionary->{$key})?$neg_dictionary->{$key}{"count"}:0;
                        $score =  ($pos_total + ($N_neg_normalized-$neg_total)*$Neg_weight) /
                            ($N_pos_normalized + $N_neg_normalized*$Neg_weight);
                        $$score_cache{$key} = $score;
                    }
                    if (! defined($w_mer_max_score) || $score > $w_mer_max_score) {
                        $w_mer_max_score = $score;
                    }
                    
                }
            }
        }
    } else {
        #    in the "absolute" case reduces to
        #    $score = ($pos_total+$N_neg - $neg_total) / ($N_pos +$N_neg);
        $pos_total = exists($pos_dictionary->{$w_mer})?$pos_dictionary->{$w_mer}{"count"}:0;
        $neg_total = exists($neg_dictionary->{$w_mer})?$neg_dictionary->{$w_mer}{"count"}:0;
        $w_mer_max_score =  ($pos_total + ($N_neg_normalized-$neg_total)*$Neg_weight) /
            ($N_pos_normalized + $N_neg_normalized*$Neg_weight);

    }
    if ($scale) {
      if (! defined($$min_score) || $w_mer_max_score < $$min_score) {
          $$min_score = $w_mer_max_score;
      }
      if (! defined($$max_score) || $w_mer_max_score > $$max_score) {
          $$max_score = $w_mer_max_score;
      }
    }
    push(@$scores, $w_mer_max_score);
  }
}


################################################################################
#
#  check_numeric
#
#  check if value is  defined and numeric; type, max and min optional
#  if supplied type "int" signals check if truncated == original
#  returns 0 if tests fail, 1 otherwise
#
################################################################################

sub check_numeric {
  my ($data,    # string to check
      $type,        # if defined and "int" check that no fraction otherwise ignore
      $min,         # if defined, check that value >= $min
      $max          # if defined, check that value <= $max
  ) = @_;
  if (defined($data)) {
        if (!looks_like_number($data)) {
          return 0;
        }
        if (defined($max) && $data > $max) {
          return 0;
        }
        if (defined($min) && $data < $min) {
          return 0;
        }
        if (defined($type) && $type eq "int") {
          return int($data) == $data;
        }
        # passed all tests here:
        return 1;
  } else {
    return 0;
  }
}

################################################################################
#
#  roundup
#
#  return smallest integer >= given value
#
################################################################################

sub roundup {
    my ($n) = @_;
    return(($n == int($n)) ? $n : int($n + 1))
}


################################################################################
#
#    read_seq_file
#
#    read a sequence file and return the sequences converted to upper case plus
#    names and comments after names $sequences. If alphabet given die if letter
#    not in given alphabet string encountered.
#
################################################################################

sub read_seq_file {
  my ($infilename,   # string containing path to file
      $sequences,    # reference to array to contain the result (uppercased)
      $total_length, # reference to scalar to contain length of new sequence
      $verbose,      # if true print stats to STDERR
      $alphabet,     # if defined used to check for valid letters
      $equiv_groups  # convert groups of letters into the first letter
  ) = @_;
  $$total_length = 0;
  if ($verbose) { print STDERR "$infilename: "; }
  open(INFILE, "<$infilename") or die ("failed to open $infilename\n");
  my $firsttime = 1; # tell read_fasta_sequence new file started

  # create a translator for the equiv groups
  my $translator;
  if (defined($equiv_groups) && scalar(@{$equiv_groups}) > 0) {
    my $tr_in = '';
    my $tr_out = '';
    foreach my $equiv_group (@$equiv_groups) {
      next unless (length($equiv_group) >= 2);
      my $group_in = substr($equiv_group, 1);
      my $group_out = substr($equiv_group, 0, 1) x length($group_in);
      $tr_in .= $group_in;
      $tr_out .= $group_out;
    }
    if (length($tr_in) > 0) {
      $translator = eval "sub { tr/\Q$tr_in\E/\Q$tr_out\E/ }" or die $@;
    }
  }


  while (my @seq = read_fasta_sequence(\*INFILE, $firsttime)) {
    $firsttime = 0; # tell read_fasta_sequence file not new anymore
    
    if (defined $alphabet) {
      # check sequence
      die("sequence data `$seq[$SEQPOS]' not from alphabet `".
          $alphabet->name()."'") if ($alphabet->find_unknown($seq[$SEQPOS]));
      # convert aliases to their core symbol
      $seq[$SEQPOS] = $alphabet->translate_seq($seq[$SEQPOS], NO_ALIASES => 1);
    }
    &$translator($seq[$SEQPOS]) if (defined $translator);
    
    push (@$sequences, \@seq);
    # seq[0] is name, $seq[1] is sequence, $seq[2] is comment
    $$total_length += length($seq[$SEQPOS]);
  }
  die "no sequence data in $infilename" unless $$total_length > 0;
  close INFILE or die "failed to close $infilename";
  if ($verbose) { print STDERR "$$total_length bases or amino acids\n"; }
}


################################################################################
#
#               list_w_mers
#
#       return all unique w_mers in a set of sequences as a list
#               optionally including reverse complements
#
################################################################################

sub list_w_mers {
  my ($sequences, $alph, $revcomp, $width) = @_;
  my %dictionary;
  my $N = @$sequences;
  my @w_mers;
  foreach my $seq (@$sequences) {
    my $reverse;
    if ($revcomp) {
      $reverse = $alph->rc_seq($seq);
    }
    my $M = length($seq);
    for (my $j = 0; $j < $M-$width+1; $j++) {
      $dictionary{substr($seq, $j, $width)} = 1;
      if ($revcomp) {
        $dictionary{substr($reverse, $j, $width)} = 1;
      }
    }
  }
  for my $name (keys %dictionary) {
    push (@w_mers, $name);
  }
  return \@w_mers;
}

################################################################################
#
#       W-mer to PSPM
#
#  convert a w-mer to a PSP in MEME format,
#
################################################################################
sub wmer2pspm {
  my $letters;
  ($_, $letters) = @_;
  my @bases = split(//);
  my @matrix;
  my $lastposition = @$letters[scalar@{$letters}-1];
  foreach my $base (@bases) {
    my $line = "";
    foreach my $position (@$letters) {
      #print "considering `$base' at `$position'\n";
      $line .= (uc($base) eq $position)?1:0;
      $line .= ($position eq $lastposition)?"":" ";
    }
    push(@matrix,$line);
  }
  return \@matrix;
}

################################################################################
#  read_prior_file
#
#  Read a PRIOR file and return the values in an
#  associative array indexed by sequence name (based on read_fasta_file).
#  Values in the associative array are in a reference to an anonymous array.
#  Does NOT check validity of data to allow use for a range of prior types.
#  Only assumes a FASTA-like name line starting with >name followed by a
#  sequence of values that may contain spaces (unlike read_fasta_file, which
#  edits out spaces)
#
#  if $keepcomments is true then return a hash containing references
#  to arrays of references to arrays of PSP, comments (from end of name line)
#
#  $file    open file pointer
#
################################################################################
sub read_prior_file {
  my ($file, $keepcomments) = @_;

  open(F, "<$file") || die("Can't open file $file\n");
  
  my ($name, $seq, $comments) = ("","","");
  my %seqs;

  # read the file
  while (<F>) {
    if (/^>(\S+)\s+(\S.*)$/) {        # found start of sequence
      if ($name) {        # save previous sequence in array
          $_ = $seq;
          if ($keepcomments) {
              $seqs{$name} = [[split], $comments];
          } else {
              $seqs{$name} = [split];
          }
      }
      $name = $1;        # save sequence name
      $comments = $2;
      $seq = "";
    } else {
      $seq .= $_;
    }
  } # read file
  if ($name && $seq) {        # save very last sequence in array
    $_ = $seq;
    if ($keepcomments) {
        $seqs{$name} = [[split], $comments];
    } else {
        $seqs{$name} = [split];
    }
  }

  %seqs;          # return associative array of names and arrays of values
} # read_prior_file

1;
