########################################
# The contents of this file are subject to the MLX PUBLIC LICENSE version
# 1.0 (the "License"); you may not use this file except in
# compliance with the License.
# 
# Software distributed under the License is distributed on an "AS IS"
# basis, WITHOUT WARRANTY OF ANY KIND, either express or implied.  See
# the License for the specific language governing rights and limitations
# under the License.
# 
# The Original Source Code is "compClust", released 2003 September 03.
# 
# The Original Source Code was developed by the California Institute of
# Technology (Caltech).  Portions created by Caltech are Copyright (C)
# 2002-2003 California Institute of Technology. All Rights Reserved.
########################################
#
#       Authors: Chris Hart,  Diane Trout, Benjamin Bornstein, Lucas Scharenbroich
# 

"""
Usage: <multiRun cannot be invoked directly from the command line>

 Wrapper for multiRun (Monte-Carlo Cross Validation).

 Brief Algorithm Description:

 Required Parameters:  (note: the list enclosed in the brackets gives the
                              possible values each parameter can take )

         multiRun_parameter_name     = 'x'

                 The parameters name to iterate over when performing
                 multiRun.  Typically 'k' is chosen.

         multiRun_parameter_values   = [list]

                 A list of values to substitute for multiRun_parameter_name.
                 Since the parameters files are executed as python code,
                 an easy way to perform multiRun over a sequence is to do
                 something similar to multiRun_parameter_values = range(10)

         multiRun_num_trials         = x

                 Number of times to run each algorithm for a particular
                 value in multiRun_parameter_values.  An algorithm will be
                 run a total of multiRun_num_trials * len(multiRun_parameter_values)
                 times

 Optional / Dependent Parameters:

         multiRun_fitness            = filename / or 'yes'

                 The multiRun_fitness specifies a file to save the
                 fitness table computed by multiRun. Without any directory
                 information the default directory will be the same
                 as the results file, any provided directory information
                 will completely override the save location.

                 if yes, the output file's basename is used, but with an
                 extension of .fit

         multiRun_save_state         = filename

                 The multiRun_save_state option specifies a file to save
                 a pickled version of the algorithm classes run by multiRun.
                 Like multiRun_fitness it defaults to the results directory
                 when the directory portion of the path is not specified.
"""
from __future__ import nested_scopes

import random
import Numeric
import sys
import types

from types import *
from compClust.util import NaN
from compClust.util.TimeStampedPrintStream import TimeStampedPrintStream
from compClust.util import listOps
from compClust.util.CCWarnings import DebugWarning, warn
from compClust.util.WrapperParameterDescription import WrapperParameterDescription as WPD

from compClust.mlx.ML_Algorithm import ML_Composable_Algorithm

import compClust.util.WrapperParameters as wp

class Parameters(wp.WrapperParameters):
  """Parameter schema for the MultiRun wrapper.

  Declares the required, optional, and experimental parameters that
  MultiRun accepts; see the module docstring for the user-facing
  descriptions of each one.
  """
  # Each entry declares one parameter: its name, an optional default
  # value, constraints, a doc string, and a priority level that the
  # wrapper framework uses to decide whether the parameter is required.
  _params = [
    wp.StrProperty('parameter_name',
                   doc='The parameter of the sub algorithm to vary over.',
                   priority=wp.Priority.REQUIRED,),
    wp.ListProperty('parameter_values',
                   doc='Set of values for the variable parameter.',
                   priority=wp.Priority.REQUIRED,),
    wp.IntProperty('num_trials', 100, min=0,
                   doc='The number of trials to run for each parameter value',
                   priority=wp.Priority.REQUIRED,),
    wp.IntProperty('seed', 42,
                   doc="""
Set the starting seed for the random number generator, defaults to 42.""",
                   priority=wp.Priority.OPTIONAL),
    wp.StrProperty('fitness', 
                   doc="""
The multiRun_fitness specifies a file to save the fitness table
computed by multiRun. Without any directory information the default
directory will be the same as the results file, any provided directory
information will completely override the save location.""",
                   priority=wp.Priority.EXPERIMENTAL),
    wp.StrProperty('save_state', 
                   doc="""
The multiRun_save_state option specifies a file to save a pickled
version of the algorithm classes run by multiRun.  Like
multiRun_fitness it defaults to the results directory when the
directory portion of the path is not specified.""",
                   priority=wp.Priority.EXPERIMENTAL),
    wp.StrProperty('save_intermediate_files', "no",
                   doc="""
Save intermediate files created by the sub-algorithm being run""",
                   priority=wp.Priority.EXPERIMENTAL),
    ]

# Print stream that prefixes each message with a timestamp and a
# "multiRun:" tag, used for status/error reporting throughout this module.
MESSAGE_STREAM = TimeStampedPrintStream("%Y-%b-%d %H:%M: multiRun: ")
# Module-level debug flag (0 = disabled).
DEBUG = 0

#
# Meta-wrapper which implements Monte-Carlo Cross Validation
#

class MultiRun(ML_Composable_Algorithm):
  """Meta-wrapper implementing Monte-Carlo Cross Validation.

  Sweeps one parameter of a sub-algorithm over a list of values,
  running the sub-algorithm ``num_trials`` times per value and
  recording a fitness score for each run.  ``getLabeling()`` /
  ``getModel()`` lazily re-run the sub-algorithm over the full dataset
  using the best-scoring parameter value.
  """

  def __init__(self, dataset=None, parameters=None, algorithm=None):
    """MultiRun(dataset=None, parameters=None, algorithm=None)

    dataset    -- dataset the sub-algorithm will be run against
                  (may also be supplied later via setDataset)
    parameters -- mapping used to populate a Parameters instance
    algorithm  -- the sub-algorithm whose parameter is swept
    """

    #
    # Initialize variables from the parameters
    #

    self.algorithm = algorithm
    self.parameters = Parameters(parameters)
    self.setDataset(dataset)

    #
    # Nothing has been made yet, so everything is None
    #

    self.best_run           = 0     # set to 1 once constructBestClustering() completes
    self.fitness_tbl        = None  # list of [parameter_value, fitness_score] pairs

    # Extra parameters forwarded to a non-composable sub-algorithm on
    # every run (see run() and constructBestClustering()).
    self._sub_parameters = {}

  def copy(self):
    """Return a new MultiRun over the same dataset, parameters, and
    sub-algorithm."""

    # BUG FIX: the original returned multiRun(...), an undefined name
    # (the class is MultiRun), which raised NameError when called.
    return MultiRun(self.getDataset(), self.getParameters(), self.algorithm)

  def clear(self):
    """Clear this wrapper's state, and the sub-algorithm's as well."""

    super(MultiRun, self).clear()
    if self.algorithm is not None:
      self.algorithm.clear()

  def setParameters(self, parameters):
    """setParameters(parameters):

    Overloaded method to force the algorithm to have at least a bare
    amount of parameters.
    """

    # Start from the defaults and overlay whatever the caller supplied.
    dp = self.getDefaultParameters()
    if parameters is not None:
      dp.update(parameters)

    self.parameters.setParametersDictionary(dp)

  def getDefaultParameters(self):
    """getDefaultParameters():

    returns a set of sensible parameters for running multiRun when the
    caller has no other information.  Also used from within the
    constructor.
    """

    parms = {}

    # Default sweep: ten random seeds, ten trials each.
    parms[ "parameter_name"  ] = 'seed'
    parms[ "parameter_values"] = [random.randrange (0,10000) for x in range(10)] 
    parms[ "num_trials"      ] = 10

    return parms
  
  def getLabeling(self):
    """Labeling = multiRun.getLabeling()

    Lazily runs constructBestClustering() if it has not been done yet,
    then returns the sub-algorithm's labeling.
    """

    if not self.best_run:
      warn("construcing best clustering", DebugWarning)
      self.constructBestClustering()
    warn("getLabeling getting itself", DebugWarning)
    return  self.algorithm.getLabeling()
  
  def getModel(self):
    """Model = multiRun.getModel()

    Lazily runs constructBestClustering() if it has not been done yet,
    then returns the sub-algorithm's model.
    """
    if not self.best_run:
      self.constructBestClustering()

    return self.algorithm.getModel()
    
  def validate(self):
    """validate()

    Ensures that all parameters and environment variables necessary
    to run the clustering algorithm (multiRun) are defined.

    Returns true (1) on success, false (0) on failure.
    """

    from compClust.util import Verify

    parameter_names   = [ "parameter_name",
                          "parameter_values",
                          "num_trials" ]

    fail = 0

    # NOTE(review): this treats a truthy return from parameters_exist()
    # as failure, i.e. it appears to report MISSING parameters --
    # confirm against compClust.util.Verify.
    if Verify.parameters_exist( parameter_names, self.parameters ):
      fail = 1

    #
    # Take care of default parameters
    #
    
    if not self.parameters.has_key("seed"):
      self.parameters["seed"] = 42

    if not self.parameters.has_key("save_intermediate_files"):
      self.parameters["save_intermediate_files"] = "no"

    return not fail

  def writeSpecialFiles(self):
    """
    writeSpecialFiles(self)

    After the multiRun loop has completed there may be auxiliary information
    that should be written out as well.  This function does that.  Currently,
    only the fitness table and a pickle.dump on the object are done.
    """

    import cPickle as pickle
    import os.path

    #
    # Check to see if we should dump the object state
    #
    # NOTE(review): these lookups use the 'multiRun_' prefixed names while
    # the Parameters schema declares 'save_state' / 'fitness' -- confirm
    # the wrapper framework maps prefixed names onto the schema.
    #

    if self.parameters.has_key('multiRun_save_state'):

      file = self.parameters['multiRun_save_state']

      #
      # If it is not a full path, place it in the results directory
      #
      
      if len( os.path.split( file )[0] ) == 0:
        file = self.parameters['results_dir'] + os.sep + file
        
      stream = open(file, "w")

      #
      # Save the multiRun state
      #
    
      pickle.dump(self, stream)
      stream.close()

    #
    # Check for outputting fitness file
    #
    
    if self.parameters.has_key('multiRun_fitness'):
      file = self.parameters['multiRun_fitness']

      if file == "yes":
        # BUG FIX: the original read self.parameters[''] (an empty
        # parameter name), which can never succeed.  Per the module
        # docstring, "yes" means: use the output file's basename with a
        # .fit extension.
        # NOTE(review): assumes the results file is stored under
        # 'results_file' -- confirm against the wrapper framework.
        file = os.path.splitext(self.parameters['results_file'])[0] + ".fit"

      else:
        if len( os.path.split(file)[0] ) == 0:
          file = self.parameters['results_dir'] + os.sep + file
          
      stream = open(file, "w")

      #
      # Save the fitness table
      #
      
      fitness_table = self.getFitnessTable()
      best_param    = repr(self.getBestParam())
      
      self.writeFitnessTable(stream, fitness_table, best_param)
        
      stream.close()

  def run(self):
    """value = run()

    Performs the Monte-Carlo sweep: for each trial, runs the
    sub-algorithm once per value in parameter_values, scores each
    resulting model's fitness against the (possibly transformed)
    dataset, and collects [value, score] pairs into the fitness table.
    Returns the wrapper status code.
    """
    
    import compClust.mlx.wrapper
    from compClust.mlx.datasets import Dataset
    
    #
    # Starting a new multiRun run, so the best_run is invalidated, also
    # clear any labeling the dataset may have.  Note that
    # self.algorithm.labeling may still reference this Labeling object,
    # but that should go away when the algorithm is .run().
    #

    self.best_run = 0
    algorithm  = self.algorithm

    #
    # Copy parameters in case the wrapper does something strange
    #
    parameters_dictionary_copy = algorithm.parameters.getParametersDictionary().copy()
    dataset = self.getDataset()

    if self.algorithm.hasLabeling():
      labeling = algorithm.getLabeling()
      self.dataset.removeLabeling(labeling)

    #
    # Initialize the seed for the random number generator
    #

    if algorithm.parameters.has_key("seed"):
      random.seed(algorithm.parameters["seed"])

    #
    # get the values from the MultiRun parameters
    #

    multiRun_variable = self.parameters.parameter_name
    multiRun_values   = self.parameters.parameter_values

    # BUG FIX: num_trials was hard-coded to 1 (apparently a debugging
    # leftover), silently ignoring the required 'num_trials' parameter.
    # The module docstring promises num_trials * len(parameter_values)
    # total runs per sweep.
    num_trials    = self.parameters.num_trials
    num_values    = len( multiRun_values )

    #
    # Initialize variable for the multiRun loop
    #

    fitness_scores = []
    
    #
    # Try to get the transformed data, if there is none or an error, use
    # the non-transformed data passed in.  The transformed data is needed
    # for scoring the fitness.  Be sure that there are actually
    # parameters present in the algorithm.
    #
    
    xformed_dataset = algorithm.getTransformedDataset( self.dataset )
    if xformed_dataset is None:
      xformed_dataset = self.dataset

    #
    # NOTE: At this point (for some wrappers) the parameters may have
    # been .copy()ed. So we need to do a .getParameters() before
    # operating on the parameters hash.
    #
    
    parameters_dictionary = algorithm.parameters.getParametersDictionary()

    #
    # Start the multiRun loop
    #
    warn("multirun %s starting loop" % self.parameters['parameter_name'], DebugWarning)
    for trial_index in range(num_trials):

      currentDataset = Dataset(self.getDataset())

      #
      # Now run over all the multiRun values
      # 

      for value_index in range(num_values):
        
        error = 0
        
        #
        # Create a copy of the parameters local to the algorithm
        # being run at this point
        #
        
        algorithm.parameters.setParametersDictionary(parameters_dictionary)
        # if we're still working with composable algorithms, just set the sub
        # parameter and continue
        if isinstance(algorithm, ML_Composable_Algorithm):
          algorithm._setSubParameter(multiRun_variable, multiRun_values[value_index])
        else:
          algorithm.parameters[multiRun_variable] = multiRun_values[value_index]
          # also check to see if we have any subparameters and set them too
          if len(self._sub_parameters) > 0:
            for k,v in self._sub_parameters.items():
              algorithm.parameters[k] = v
        algorithm.setDataset( currentDataset )
        
        #
        # Make sure the algorithm is able to be .run()
        #

        if not algorithm.validate():
          error = 1
          MESSAGE_STREAM.write("validation in run() failed.\n")

        #
        # Run the algorithm and check for an error message
        #

        if not error:
          warn("multirun sweep %s=%s"%(self.parameters['parameter_name'], multiRun_values[value_index]), DebugWarning)
          status = algorithm.run()
          if status == compClust.mlx.wrapper.WRAPPER_STATUS_ERROR:
            error = 1
            MESSAGE_STREAM.write("run terminated with error.\n")

        #
        # If the .run() was Ok, then try to get the model created for
        # the training data
        #

        if not error:
          model = algorithm.getModel()
          if model is None:
            error = 1
            MESSAGE_STREAM.write("(trial = " + str(trial_index) + ", " + \
                                 multiRun_variable + " = " + \
                                 str(multiRun_values[value_index]) + \
                                 ") produced no Model!\n")

        #
        # get the models fitness given the test data.
        #
        # Hopefully we have a valid model, if not, then there can be no
        # fitness score.  The _only_ time a None model should be returned
        # is when k_strict or some other highly restrictive parameter is
        # set.  The general rule is that if the clustering algorithms
        # succeeds then one should expect to be able to get some valid
        # fitness score
        #
        # A model should also be None if the clustering algorithm failed
        # at the C code level (i.e. segfault or other crash)
        #

        if not error:
          test_data = xformed_dataset.getData()
          score = model.evaluateFitness( test_data )
          test_data = None
          
          # A NaN fitness is treated the same as a failed run.
          if NaN.isNaN(score):
            error = 1
            
        #
        # If we have a valid score for this run, create a tuple of
        # the parameter value and its fitness score
        #

        if not error:
          value       = multiRun_values[value_index]
          fitness_scores.append([value, score])

        #
        # Clear algorithm for another iteration
        #

        algorithm.clear()
        model = None
        
      #
      # Remove the training set
      #
      
      currentDataset = None

    #
    # The multiRun loop is completed, so now remove the transformed data
    # and finish up
    #

    xformed_dataset = None
    self.fitness_tbl = fitness_scores

    # Restore the parameters from our copy
    algorithm.parameters.setParametersDictionary(parameters_dictionary_copy)
    
    # restore our dataset pointer to what we were set to when we started
    # running
    self.setDataset(dataset)

    #
    # save the intermediate information if requested (fitness file and
    # multiRun state)
    #

    self.writeSpecialFiles()
    warn("done with run %s" %(self.parameters['parameter_name']), DebugWarning)
    return compClust.mlx.wrapper.WRAPPER_STATUS_DONE
  
  def getFitnessTable(self):
    """Return the fitness table built by run(), a list of
    [parameter_value, fitness_score] pairs (None before run())."""

    return self.fitness_tbl
    
  def getBestParam(self):
    """Return the parameter value with the highest fitness score, or
    None if run() has not produced a fitness table yet."""
    fitness_table = self.getFitnessTable()
    if fitness_table is None:
      return None
    # Sort by score ascending and take the value paired with the
    # highest score.
    return listOps.sort([[x[1],x[0]]for x in  fitness_table])[-1][1]

  def constructBestClustering(self):
    """multiRun.constructBestClustering()

    Once a multiRun run is complete, this function runs the best
    clustering based on the parameters discovered.

    Raises RuntimeError if the sub-algorithm fails to validate.
    """

    #
    # A complete run needs to be done here since during training, only a
    # fraction of the dataset has been used.  constructBestClustering() will
    # return an algorithm run over the whole dataset
    #

    name = self.parameters['parameter_name']
    warn("starting best clustering %s=%s" % (name, self.getBestParam()), DebugWarning)

    #
    # Set the parameters to the one we determined as best during the
    # multiRun iterations
    #
    
    # if we're still working with composable algorithms, just set the sub
    # parameter and continue
    if isinstance(self.algorithm, ML_Composable_Algorithm):
      self.algorithm._setSubParameter(self.parameters.parameter_name, self.getBestParam())
    else:
      self.algorithm.parameters[self.parameters.parameter_name] = self.getBestParam()
      # also check to see if we have any subparameters and set them too
      if len(self._sub_parameters) > 0:
        for k,v in self._sub_parameters.items():
          self.algorithm.parameters[k] = v

    self.algorithm.setDataset( self.getDataset() )

    #
    # Do the run to partition the dataset
    #

    if (self.algorithm.validate()):
      # Composable sub-algorithms construct their own best clustering;
      # plain wrappers are simply run.
      if isinstance(self.algorithm, ML_Composable_Algorithm):
        self.algorithm.constructBestClustering()
      else:
        self.algorithm.run()
    else:
      raise RuntimeError("MultiRun algorithm validation failed")

    #
    # Remove the dataset from the current algorithm, but keep the model.  Since
    # we passed a reference to self.dataset(), the labeling comes along for
    # free.
    #

    self.algorithm.setDataset(None)
    
    #
    # Flag that we've completed the best run
    #

    self.best_run = 1
    warn("ending best clustering %s" %(name), DebugWarning)

  def writeFitnessTable(self, stream, array, best_param):
    """
    writeFitnessTable(stream, array, best_param)
    
    write the fitness table tuples out with a header that indicates the best
    parameter.
    """
    
    stream.write("best_param\t%s\n" % best_param)

    # One tab-separated line per fitness entry.
    for row in array:
      if len(row) > 0:
        stream.write(str(row[0]))
        for item in row[1:]:
          stream.write("\t%s" % (str(item)))
      stream.write("\n")
      
if __name__ == "__main__":
  import compClust.mlx.wrapper.Launcher
  import sys
  
  compClust.mlx.wrapper.Launcher.main(sys.argv + ['--h'], multiRun())
