########################################
# The contents of this file are subject to the MLX PUBLIC LICENSE version
# 1.0 (the "License"); you may not use this file except in
# compliance with the License.
# 
# Software distributed under the License is distributed on an "AS IS"
# basis, WITHOUT WARRANTY OF ANY KIND, either express or implied.  See
# the License for the specific language governing rights and limitations
# under the License.
# 
# The Original Source Code is "compClust", released 2003 September 03.
# 
# The Original Source Code was developed by the California Institute of
# Technology (Caltech).  Portions created by Caltech are Copyright (C)
# 2002-2003 California Institute of Technology. All Rights Reserved.
########################################
#
#       Authors: Diane Trout, Benjamin Bornstein, Lucas Scharenbroich
# 
# Last Modified: Aug. 9, 2001
#
#

"""
Usage: <MCCV cannot be invoked directly from the command line>

 Wrapper for MCCV (Monte-Carlo Cross Validation).

 Brief Algorithm Description:

 Required Parameters:  (note: the list enclosed in the brackets gives the
                              possible values each parameter can take)

         mccv_parameter_name     = 'x'

                 The parameters name to iterate over when performing
                 MCCV.  Typically 'k' is chosen.

         mccv_parameter_values   = [list]

                 A list of values to substitute for mccv_parameter_name.
                 Since the parameters files are executed as python code,
                 an easy way to perform MCCV over a sequence is to do
                 something similar to mccv_parameter_values = range(10)

         mccv_test_fraction      = x

                 x is a value between 0.0 and 1.0 which indicates the
                 percentage of the dataset to use for testing.

         mccv_num_trials         = x

                 Number of times to run each algorithm for a particular
                 value in mccv_parameter_values.  An algorithm will be
                 run a total of mccv_num_trials * len(mccv_parameter_values)
                 times

 Optional / Dependent Parameters:

         seed                    = x

                 The seed to use for the pseudo-random number generator
                 when randomly partitioning the training set.  Defaults to 42.

         mccv_fitness            = filename / or 'yes'

                 The mccv_fitness specifies a file to save the
                 fitness table computed by MCCV. Without any directory
                 information the default directory will be the same
                 as the results file, any provided directory information
                 will completely override the save location.

                 if 'yes', the output file's basename is used, but with
                 an extension of .fit

         mccv_save_state         = filename

                 The mccv_save_state option specifies a file to save
                 a pickled version of the algorithm classes run by mccv.
                 Like mccv_fitness it defaults to the results directory
                 when the directory portion of the path is not specified.
"""

import random
import Numeric
import sys

from types import *
from compClust.util import NaN
from compClust.util.TimeStampedPrintStream import TimeStampedPrintStream

from compClust.mlx.ML_Algorithm import ML_Composable_Algorithm

import compClust.util.WrapperParameters as wp

class Parameters(wp.WrapperParameters):
  """Declarative schema of the parameters accepted by the MCCV wrapper.

  Each wp.*Property entry below corresponds to one of the ``mccv_*``
  parameters described in the module docstring.  Defaults, bounds and
  priority handling are implemented by compClust.util.WrapperParameters
  (not visible in this file).
  """
  _params = [
    # Name of the sub-algorithm parameter to iterate over (e.g. 'k').
    wp.StrProperty('parameter_name',
                   doc='The parameter of the sub algorithm to vary over.',
                   priority=wp.Priority.REQUIRED,),
    # The candidate values substituted for parameter_name.
    wp.ListProperty('parameter_values',
                   doc='Set of values for the variable parameter.',
                   priority=wp.Priority.REQUIRED,),
    # Fraction of rows held out for testing; default 0.5, clamped [0, 1].
    wp.FloatProperty('test_fraction', 0.5, 0.0, 1.0,
                     doc="""
x is a value between 0.0 and 1.0 which indicates the percentage of the
dataset to use for testing.""",
                     priority=wp.Priority.REQUIRED,),
    # Trials per candidate value; total runs = num_trials * len(values).
    wp.IntProperty('num_trials', 100, min=0,
                   doc='The number of trials to run for each parameter value',
                   priority=wp.Priority.REQUIRED,),
    wp.IntProperty('seed', 42,
                   doc="""
Set the starting seed for the random number generator, defaults to 42.""",
                   priority=wp.Priority.OPTIONAL),
    # Output file for the fitness table; see writeSpecialFiles().
    wp.StrProperty('fitness', 
                   doc="""
The mccv_fitness specifies a file to save the fitness table
computed by mccv. Without any directory information the default
directory will be the same as the results file, any provided directory
information will completely override the save location.""",
                   priority=wp.Priority.EXPERIMENTAL),
    # Output file for a pickled copy of the MCCV object.
    wp.StrProperty('save_state', 
                   doc="""
The mccv_save_state option specifies a file to save a pickled
version of the algorithm classes run by mccv.  Like
mccv_fitness it defaults to the results directory when the
directory portion of the path is not specified.""",
                   priority=wp.Priority.EXPERIMENTAL),
    wp.StrProperty('save_intermediate_files', "no",
                   doc="""
Save intermediate files created by the sub-algorithm being run""",
                   priority=wp.Priority.EXPERIMENTAL),
    # Directory where auxiliary output files land by default.
    wp.StrProperty('results_dir', 
                   priority=wp.Priority.INTERNAL),
    ]

# Stream used for all MCCV log output; every line is prefixed with a
# timestamp in the given strftime format.
MESSAGE_STREAM = TimeStampedPrintStream("%Y-%b-%d %H:%M: MCCV: ")
# Module-level debug flag (0 = quiet); never toggled within this file.
DEBUG = 0

#
# Meta-wrapper which implements Monte-Carlo Cross Validation
#

class MCCV(ML_Composable_Algorithm):
  """Meta-wrapper implementing Monte-Carlo Cross Validation.

  Repeatedly splits the dataset into random train/test partitions, runs
  the wrapped algorithm for every candidate parameter value, and scores
  each trained model on the held-out rows.
  """

  # Labels used when partitioning the data into train / test subsets.
  PARTITION_TRAIN  = '0'
  PARTITION_TEST   = '1'

  def __init__(self, dataset=None, parameters=None, algorithm=None):
    """MCCV(dataset=None, parameters=None, algorithm=None)

    dataset    -- the dataset to cross-validate over (may be None)
    parameters -- mapping used to populate a Parameters instance
    algorithm  -- the wrapped ML algorithm that is run repeatedly
    """
    self.algorithm  = algorithm
    self.parameters = Parameters(parameters)
    self.setDataset(dataset)

    # Nothing has run yet: no fitness table, and the "best" clustering
    # has not been constructed.
    self.best_run    = 0
    self.fitness_tbl = None

  def copy(self):
    """Return a new MCCV configured with this instance's dataset,
    parameters, and wrapped algorithm."""
    return MCCV(self.getDataset(), self.getParameters(), self.algorithm)


  def clear(self):
    """Reset any state left from a previous run.

    Delegates to the base class and then clears the wrapped
    sub-algorithm as well.
    """
    # BUG FIX: the original called ML_Algorithm.clear(self), but this
    # module only imports ML_Composable_Algorithm, so that line raised
    # NameError whenever clear() was invoked.  Call the actual base
    # class instead.
    ML_Composable_Algorithm.clear(self)
    if self.algorithm is not None:
      self.algorithm.clear()

  #def setParameters(self, parameters):
  #  """setParameters(parameters):
  #
  #  Overloaded method to force the algorithm to have at least a bare
  #  amount of parameters.
  #  """
  #
  #  dp = self.getDefaultParameters()
  #  if parameters is not None:
  #    dp.update(parameters)
  #
  #  self.parameters = dp
  #
  #
  #def getDefaultParameters(self):
  #  """getDefaultParameters():
  #
  #  returns a set of sensible parameters from running MCCV when the
  #  caller has no other information.  Also used from within the
  #  constructor
  #  """
  #
  #  parms = {}
  #
  #  parms[ "mccv_parameter_name"  ] = 'k'
  #  parms[ "mccv_parameter_values"] = range(1,10)
  #  parms[ "mccv_test_fraction"   ] = 0.5
  #  parms[ "mccv_num_trials"      ] = 10
  #
  #  return parms

  
  def getLabeling(self):
    """Labeling = MCCV.getLabeling()
    """

    if not self.best_run:
      self.constructBestClustering()

    return  self.algorithm.getLabeling()

  
  def getModel(self):
    """Model = MCCV.getModel()
    """
    if not self.best_run:
      self.constructBestClustering()

    return self.algorithm.getModel()


  def validate(self):
    """validate()

    Check that every parameter required to run MCCV is defined, and
    install defaults for the optional ones.  Returns true when the
    algorithm is ready to be .run().
    """

    from compClust.util import Verify

    required = [ "parameter_name",
                 "parameter_values",
                 "test_fraction",
                 "num_trials" ]

    # NOTE(review): Verify.parameters_exist appears to return a truthy
    # value when something is missing -- confirm against compClust.util.
    ok = not Verify.parameters_exist( required, self.parameters )

    # Fill in defaults for the optional parameters.
    if not self.parameters.has_key("seed"):
      self.parameters.seed = 42

    if not self.parameters.has_key("save_intermediate_files"):
      self.parameters.save_intermediate_files = "no"

    return ok


  def writeSpecialFiles(self):
    """
    writeSpecialFiles(self)

    After the MCCV loop has completed there may be auxiliary information that
    should be written out as well.  This function does that.  Currently, only
    the fitness table and a pickle.dump on the object are done.
    """

    import cPickle as pickle
    import os.path

    #
    # Check to see if we should dump the object state
    #
    save_file = self.parameters['save_state']
    if save_file  is not None and len(save_file) > 0:
      #
      # If it is not a full path, place it in the results directory
      #
      
      if len( os.path.split( save_file )[0] ) == 0:
        save_file = self.parameters['results_dir'] + os.sep + save_file
        
      stream = open(save_file, "w")

      #
      # Save the MCCV state
      #
    
      pickle.dump(self, stream)
      stream.close()

    #
    # Check for outputting fitness file
    #
    fitness_file = self.parameters['fitness']
    if fitness_file is not None and len(fitness_file) > 0:

      if fitness_file == "yes":
        # FIXME: this was previously self.parameters[''] which doesn't
        # FIXME: seem like it should have any chance of working
        # FIXME: this should at least save a file, though i have no idea
        # FIXME: if it's the right thing to do
        file = self.parameters['results_dir'] + os.sep + 'fitness'
        
      else:
        if len( os.path.split(file)[0] ) == 0:
          file = self.parameters['results_dir'] + os.sep + file
          
      stream = open(file, "w")

      #
      # Save the fitness table
      #
      
      fitness_table = self.getFitnessTable()
      best_param    = repr(self.getBestParam())
      
      self.writeFitnessTable(stream, fitness_table, best_param)
        
      stream.close()


  def run(self):
    """value = run()

    Execute the MCCV loop.  For each of num_trials random train/test
    splits of the dataset, the wrapped algorithm is run once per value
    in parameter_values; each trained model is scored with
    evaluateFitness() on the held-out test rows.  The accumulated
    [k, k_prime, score] rows are stored in self.fitness_tbl, the
    special output files are written, and WRAPPER_STATUS_DONE is
    returned.
    """

    import compClust.mlx.wrapper
    from compClust.mlx.datasets import Dataset

    #
    # Starting a new MCCV run, so the best_run is invalidated; also clear
    # any labeling the dataset may have.  Note that self.algorithm.labeling
    # may still reference this Labeling object, but that should go away when
    # the algorithm is .run().
    #

    self.best_run = 0
    algorithm  = self.algorithm

    #
    # Copy parameters in case the wrapper does something strange
    #

    parameters = self.algorithm.getParameters().copy()

    labeling = algorithm.getLabeling()
    if labeling is not None:
      self.dataset.removeLabeling(labeling)

    #
    # Initialize the seed for the random number generator so the
    # partitions are reproducible
    #

    if parameters.has_key("seed"):
      random.seed(parameters["seed"])

    #
    # get the values from the parameters
    #

    mccv_variable = self.parameters.parameter_name
    mccv_values   = self.parameters.parameter_values

    num_trials    = self.parameters.num_trials
    num_values    = len( mccv_values )

    test_fraction = self.parameters.test_fraction

    #
    # Initialize variable for the MCCV loop
    #

    fitness_scores = []

    #
    # Try to get the transformed data, if there is none or an error, use
    # the non-transformed data passed in.  The transformed data is needed for
    # scoring the fitness.  Be sure that there are actually parameters present
    # in the algorithm.
    #

    algorithm.setParameters( parameters )

    xformed_dataset = algorithm.getTransformedDataset( self.dataset )
    if xformed_dataset is None:
      xformed_dataset = self.dataset

    #
    # NOTE: At this point (for some wrappers) the parameters may have been
    # .copy()ed. So we need to do a .getParameters() before operating on
    # the parameters hash.
    #

    parameters = self.algorithm.getParameters();

    #
    # Start the MCCV loop
    #

    for trial_index in range(num_trials):

      #
      # Create a partition of the data: train_partition holds roughly
      # (1 - test_fraction) of the rows, test_partition the rest ...
      #

      train_partition = self.createRandomPartition(1.0 - test_fraction)
      test_partition  = self.createInversePartition( train_partition )

      #
      # ... and build a dataset holding only the training rows
      #

      currentDataset = Dataset(Numeric.take(self.dataset.getData(),
                                            train_partition))

      #
      # Now run over all the MCCV values
      #

      for value_index in range(num_values):

        error = 0

        #
        # Point the algorithm at the current parameter value and the
        # training subset
        #

        parameters[mccv_variable] = mccv_values[value_index]
        algorithm.setParameters( parameters )
        algorithm.setDataset( currentDataset )

        #
        # Make sure the algorithm is able to be .run()
        #

        if not algorithm.validate():
          error = 1
          MESSAGE_STREAM.write("validation in run() failed.\n")

        #
        # Run the algorithm and check for an error status
        #

        if not error:
          status = algorithm.run()
          if status == compClust.mlx.wrapper.WRAPPER_STATUS_ERROR:
            error = 1
            MESSAGE_STREAM.write("run terminated with error.\n")

        #
        # If the .run() was Ok, then try to get the model created for
        # the training data
        #

        if not error:
          model = algorithm.getModel()
          if model is None:
            error = 1
            MESSAGE_STREAM.write("(trial = " + str(trial_index) + ", " + \
                                 mccv_variable + " = " + \
                                 str(mccv_values[value_index]) + \
                                 ") produced no Model!\n")

        #
        # get the models fitness given the test data.
        #
        # Hopefully we have a valid model, if not, then there can be no
        # fitness score.  The _only_ time a None model should be returned
        # is when k_strict or some other highly restrictive parameter is set.
        # The general rule is that if the clustering algorithms succeeds then
        # one should expect to be able to get some valid fitness score
        #
        # A model should also be None if the clustering algorithm failed
        # at the C code level (i.e. segfault or other crash)
        #

        if not error:
          test_data = Numeric.take(xformed_dataset.getData(), test_partition)
          score = model.evaluateFitness( test_data )
          test_data = None

          if NaN.isNaN(score):
            error = 1

        #
        # If we have a valid score for this run, create a tuple of relevant
        # statistics.  For now we track
        #
        #   k     - number of clusters specified
        #   k'    - actual clusters returned (labels in the labeling)
        #   score - fitness score of the test data on the trained model
        #

        if not error:
          k       = mccv_values[value_index]
          k_prime = len(algorithm.getLabeling().getLabels())
          fitness_scores.append([k, k_prime, score])

        #
        # Clear algorithm for another iteration
        #

        algorithm.clear()
        model = None

      #
      # Remove the training set
      #

      currentDataset = None

    #
    # The MCCV loop is completed, so now remove the transformed data and
    # finish up
    #

    xformed_dataset = None
    self.fitness_tbl = fitness_scores

    # Restore the parameters from our copy
    algorithm.setParameters(parameters)

    #
    # save the intermediate information if requested (fitness file and
    # MCCV state)
    #

    self.writeSpecialFiles()

    return compClust.mlx.wrapper.WRAPPER_STATUS_DONE
  
        
  def getFitnessTable(self):

    return self.fitness_tbl


  def __computeFitnessAvg(self, fitnessTbl):
    """
    fitness list = __computeFitnessAvg(fitnessTbl)

    Compute fitness average will run through the fitness tuples and compute
    the average score of all k_prime entries.  If a score of NaN is found,
    it is not considered.  If the sum of a series of k_primes is zero, then
    that average is not considered.  A list of tuples [k_prime, avg] is
    returned.
    """

    from compClust.util.unique import unique
    
    avgList   = []

    if fitnessTbl is not None:
      
      #
      # Remove any fitnesses which are NaN
      #
      
      shortTbl = filter(lambda x : not NaN.isNaN(x[2]), fitnessTbl)
      
      #
      # get the list of all unique k_prime
      #
      
      k_primes = unique(map(lambda x : x[1], shortTbl))

      #
      # Now get the list of fitnesses for each k_prime.
      #

      for k_prime in k_primes:
        vals = Numeric.array(filter(lambda x : (x[1] == k_prime), shortTbl));
        vals = vals[:,2];

        if len(vals) == 0:
          continue
        avg = Numeric.sum(vals) / len(vals)
        avgList.append([avg, k_prime])
      
    #
    # return the list
    #

    return avgList

    
  def getBestParam(self):
    """Return the parameter value whose average fitness was highest.

    Falls back to the first candidate value in parameter_values when no
    fitness scores are available (e.g. every trial errored out).
    """

    #
    # Compute the per-k_prime fitness averages
    #

    fitness_avg_list = self.__computeFitnessAvg(self.getFitnessTable())

    #
    # Sort the [avg, k_prime] pairs so the highest average comes first
    #

    fitness_avg_list.sort()
    fitness_avg_list.reverse()

    try:
      best_param = fitness_avg_list[0][1]
    except IndexError:
      # BUG FIX: this was a bare 'except:', which also hid genuine
      # errors (typos, attribute errors, interrupts); only an empty
      # average list is expected here.
      best_param = self.parameters.parameter_values[0]

    return best_param


  def constructBestClustering(self):
    """MCCV.constructBestClustering()

    Once an MCCV run is complete, this function returns the best
    clustering based on the parameters discovered: the wrapped
    algorithm is re-run over the *whole* dataset with the best
    parameter value, so getLabeling()/getModel() reflect all the data.
    """

    #
    # A complete run needs to be done here since during training, only a
    # fraction of the dataset has been used.  constructBestClustering() will
    # return an algorithm run over the whole dataset
    #

    parameters = self.algorithm.getParameters()

    #
    # Set the parameters to the one we determined as best during the
    # MCCV iterations
    #

    parameters[self.parameters.parameter_name] = self.getBestParam()

    self.algorithm.setParameters(parameters)
    self.algorithm.setDataset( self.getDataset() )

    #
    # Do the run to partition the dataset
    #

    if (self.algorithm.validate()):
      self.algorithm.run()

    #
    # Remove the dataset from the current algorithm, but keep the model.  Since
    # we passed a reference to self.dataset(), the labeling comes along for
    # free.
    #
    # NOTE(review): setParameters(None) presumably resets the wrapped
    # algorithm to defaults without discarding the model -- confirm
    # against the wrapper base class.
    #

    self.algorithm.setDataset(None)
    self.algorithm.setParameters(None)

    #
    # Flag that we've completed the best run
    #

    self.best_run = 1
    

  def createInversePartition(self, partition):
    """
    createInversParitions()

    let U be the set of all row numbers for the current datset
    let A be the partition set

    return U - A = A'
    """
    
    rows = range( self.getDataset().getNumRows() )
    return filter(lambda x : x not in partition, rows)

  
  def createRandomPartition(self, fraction):
    """
    createRandomParitions()

    Creates a list of rows which define a random subset of the dataset
    """
    
    nrows         = self.getDataset().getNumRows()
    mark_nrows    = int( ( nrows * fraction ) + 0.5 )

    #
    # Create a sequence of ones and zeros and then use it to 
    # mask a list of the rows.
    #
    
    seq  = self.createRandomMarkSequence(nrows, mark_nrows)
    rows = range(len(seq))

    partition = filter(None, map(lambda x,y: x*y, seq, rows))
    if seq[0] == 1:
      partition.insert(0,0)
    
    return partition

  
  def createRandomMarkSequence(self, total_size, mark_size):
    """seq = createRandomMarkSequence(total_size, mark_size)
    
    Creates a sequence containing total_size integers, either 0 or 1.
    Marks mark_size randomly chosen elements of the sequence as 1 and
    the remaining elements as 0.
    
    This method is implemented efficiently, as it never needs to
    visit more than half the elements in the sequence.

    Implementation based on da_random.c:random_mark_vector() by Joe
    Roden and Alex Gray.
    """
    
    total_size_minus_one = total_size - 1
    
    #
    # This method is implemented efficiently, as it never needs to visit
    # more than half the elements in the sequence, which also has the
    # effect of reducing collisions (choosing already marked elements)
    # and restarts (re-choosing an index) in the while / if statements
    # below.
    #
    
    #
    # If less-than half of the elements of seq are to be marked:
    #   1. Initalize seq to all 0s.
    #   2. Mark mark_size elements as 1.
    #

    if (mark_size < (total_size / 2)):
      seq = [0] * total_size

      for i in range(mark_size):
        while (1):
          index = random.randrange(0, total_size_minus_one)
          if (seq[index] == 0):
            seq[index] = 1
            break

    #
    # Otherwise:
    #   1. Initialize seq to all 1s.
    #   2. (Un)mark (total_size - mark_size) the elments as 0.
    #

    else:
      seq = [1] * total_size

      for i in range( total_size - mark_size ):
        while(1):
          index = random.randrange(0, total_size_minus_one)
          if (seq[index] == 1):
            seq[index] = 0
            break

    return seq


  def writeFitnessTable(self, stream, array, best_param):
    """
    writeFitnessTable(stream, label, array, best_param)
    
    write the fitness table tuples out with a header that indicated the best
    parameter.
    """
    
    stream.write("best_param\t%s\n" % best_param)

    for i in range(len(array)):
      tuple = array[i]
      if len(tuple) > 0:
        stream.write(str(tuple[0]))
        for item in tuple[1:]:
          stream.write("\t%s" % (str(item)))
      stream.write("\n")
      
if __name__ == "__main__":
  # MCCV is not meant to be launched stand-alone (see the module
  # docstring); appending '--h' presumably forces the Launcher to print
  # usage help instead of running -- TODO confirm against Launcher.main.
  import compClust.mlx.wrapper.Launcher
  import sys
  
  compClust.mlx.wrapper.Launcher.main(sys.argv + ['--h'], MCCV())
