########################################
# The contents of this file are subject to the MLX PUBLIC LICENSE version
# 1.0 (the "License"); you may not use this file except in
# compliance with the License.
# 
# Software distributed under the License is distributed on an "AS IS"
# basis, WITHOUT WARRANTY OF ANY KIND, either express or implied.  See
# the License for the specific language governing rights and limitations
# under the License.
# 
# The Original Source Code is "compClust", released 2003 September 03.
# 
# The Original Source Code was developed by the California Institute of
# Technology (Caltech).  Portions created by Caltech are Copyright (C)
# 2002-2003 California Institute of Technology. All Rights Reserved.
########################################
#
#       Author:  Lucas Scharenbroich
#
# Last Modified: Jan. 8, 2002 
#

"""
Usage: Hierarchical cannot be invoked directly from the command line

 Wrapper for Hierarchical clustering

 Brief Algorithm Description:
   
   The hierarchical wrapper is a meta-wrapper which builds a cluster
   hierarchy by spawning off multiple runs of a given algorithm.  The
   wrapper does not depend on a given algorithm being mccv_persistent, but
   it will help performance as many runs may be dispatched in parallel.

 Data/parameter files are not created for each algorithm. Instead, its
   run() method is invoked directly.

 Required Parameters:  (note: the list enclosed in the brackets are possible
                              values each one of parameters can take )


 Optional / Dependent Parameters:

   
    prologues are a list of functions that provide 
              pre-conditions for clustering. 
              
    Example prologues are:
    clusterSize(size)

    Returns false if the number of datapoints in a cluster falls below <size>
    
    PDRatio(ratio)
 
    Returns false if the ratio of datapoints to dimensions falls below the
    threshold <ratio>
 
              
    epilogues are a list of functions that provide 
              post-conditions for a clustering
              
    resets provide a way to alter the dataset in the 
              hope that the permuted dataset is more 
              likely to be able to be clustered.
    

    Example implementations of terminators are available in 
    compClust.mlx.wrappers.Terminators
    
    
"""

# Help text for the optional parameters; referenced by the Parameters
# class below.  (Typos fixed: "prolgues" -> "prologues",
# "false is the ratio" -> "false if the ratio".)

prologue_docs = """
prologues are a list of functions that provide
          pre-conditions for clustering.

Example prologues are:
clusterSize(size)

Returns false if the number of datapoints in a cluster falls below <size>

PDRatio(ratio)

Returns false if the ratio of datapoints to dimensions falls below the
threshold <ratio>
"""

epilogue_docs = """
epilogues are a list of functions that provide post-conditions
for a clustering
"""

reset_docs = """
resets provide a way to alter the dataset in the hope that
the permuted dataset is more likely to be able to be clustered.
"""

import os
import re
import sys
import string
import operator
import gc
import Numeric

from compClust.util import Verify
from compClust.util.unique import unique
from compClust.util.TimeStampedPrintStream import TimeStampedPrintStream

from compClust.mlx.datasets import Dataset
from compClust.mlx.labelings import Labeling
from compClust.mlx.wrapper import Terminator

from compClust.mlx.ML_Algorithm import ML_Composable_Algorithm
from compClust.mlx.MultiWayTree import MultiWayTree
from compClust.mlx.Node import *

from compClust.mlx.wrapper import Terminator
import compClust.mlx.wrapper

import compClust.util.WrapperParameters as wp

class Parameters(wp.WrapperParameters):
  # Declarative description of the parameters accepted by Hierarchical.
  # All three are optional lists of terminator functions; example
  # implementations live in compClust.mlx.wrappers.Terminators.
  _params = [
    # Pre-conditions checked before clustering a node; by default a node
    # must contain at least 10 datapoints to be clustered further.
    wp.ListProperty('prologues',
                    default=[Terminator.clusterSize(10)],
                    doc=prologue_docs,
                    priority=wp.Priority.OPTIONAL,),
    # Post-conditions deciding whether another level of clustering runs.
    wp.ListProperty('epilogues',
                    doc=epilogue_docs,
                    priority=wp.Priority.OPTIONAL,),
    # Functions that may permute the dataset and request a re-run.
    wp.ListProperty('resets',
                    doc=reset_docs,
                    priority=wp.Priority.OPTIONAL,),
  ]

# Shared, timestamp-prefixed stream used for all Hierarchical status output.
MESSAGE_STREAM = TimeStampedPrintStream("%Y-%b-%d %H:%M: Hierarchical: ")

class Hierarchical(ML_Composable_Algorithm):
  """
  Meta-wrapper which builds a cluster hierarchy by spawning off multiple
  runs of a given base algorithm, one per discovered cluster, until the
  configured prologue/epilogue terminator functions stop the descent.

  The result is exposed as a Labeling in which each row is tagged with a
  comma-separated path string identifying its leaf in the hierarchy.
  There is currently no model associated with this method.
  """

  def __init__(self, dataset = None, parameters = None, algorithm = None):
    """
    dataset    -- the Dataset to cluster
    parameters -- parameter mapping (see the Parameters class)
    algorithm  -- the base ML_Algorithm instance run at each node
    """

    self.setMessageStream( MESSAGE_STREAM )

    self.dataset    = dataset
    self.parameters = Parameters(parameters)
    self.algorithm  = algorithm

    # Populated by run().
    self.model      = None
    self.labeling   = None

  def copy(self):
    """
    Returns a new Hierarchical sharing this instance's dataset,
    parameters, algorithm, labeling and model references.
    """
    newObj = Hierarchical(self.dataset, self.parameters, self.algorithm)
    newObj.labeling = self.labeling
    newObj.model    = self.model
    return newObj

  def clear(self):
    """
    Clears this wrapper's state and that of the wrapped algorithm.
    """

    # Fixed: the base class is imported as ML_Composable_Algorithm; the
    # previous unqualified ML_Algorithm reference raised a NameError.
    ML_Composable_Algorithm.clear(self)
    if self.algorithm is not None:
      self.algorithm.clear()

  def getLabeling(self):
    """Returns the labeling produced by run(), or None before a run."""
    return self.labeling

  def getModel(self):
    """Returns the model (currently always None)."""
    return self.model

  #
  # Built-in prologue functions
  #

  def trueTerminator(self):
    """Prologue which always votes yes (continue clustering)."""
    return lambda node : 1

  def falseTerminator(self):
    """Terminator which always votes no; the default reset function."""
    return lambda node : 0

  def algoTerminator(self):
    """Prologue requiring the node to have an algorithm to run."""
    return lambda node : node.algorithm is not None

  #
  # Built-in epilogue functions
  #

  def labelingTerminator(self):
    """Epilogue requiring the node's run to have produced a labeling."""
    return lambda node : node.algorithm.getLabeling() is not None

  def clusterNumTerminator(self):
    """Epilogue requiring more than a single cluster in the labeling."""
    return lambda node : len(node.algorithm.getLabeling().getLabels()) != 1

  #
  # End built-in functions
  #

  def run(self):
    """
    run()

    Takes the given dataset and passes it off to some ML_Algorithm.  Once that
    algorithm returns, each cluster is passed off again.  This continues until
    a termination condition is reached or the ML_Algorithm fails for some
    reason.

    Returns WRAPPER_STATUS_DONE on success or WRAPPER_STATUS_ERROR when no
    hierarchy could be built.
    """

    #
    # Create the lists of prologue and epilogue functions.  Both prologues
    # and epilogues can stop a branch of the clustering, however prologues
    # are pre-conditions for clustering, while epilogues are post-conditions.
    #
    # NOTE(security): the parameter entries are eval()'d, so parameter
    # files must come from a trusted source.
    #

    self.prologues = map(eval, self.parameters[ "prologues" ])
    self.epilogues = map(eval, self.parameters[ "epilogues" ])
    self.resets    = map(eval, self.parameters[ "resets"    ])

    #
    # The model for hierarchical clustering is simply a tree
    #

    tree = self.buildHierarchy(self.algorithm)

    if tree is None:
      return compClust.mlx.wrapper.WRAPPER_STATUS_ERROR

    #
    # Go through every leaf of the tree and label its rows, based on their
    # RowIDs, with the leaf's path prefix.  Note that it is possible that
    # if the top-level clustering fails every label is the empty string.
    #

    text = [[-1]] * self.dataset.getNumRows()

    nodeIter = tree.iterator('CLR')
    node = nodeIter.next()

    while node is not None:
      if tree.isLeaf(node.key()):
        for i in range(len(node.partition)):
          text[node.partition[i]] = string.join(map(str, node.prefix),",")

      node = nodeIter.next()

    #
    # Use the list to produce the correct labeling.
    #
    # There is currently no model associated with the Hierarchical
    # clustering method.
    #

    tree.clear()

    self.labeling = Labeling(self.dataset)
    self.labeling.labelRows(text)

    return compClust.mlx.wrapper.WRAPPER_STATUS_DONE


  def __allVoteYes(self, flist, arg):
    """
    Returns true if all the functions in flist evaluate to true given the
    argument arg.  An empty flist trivially votes yes.
    """

    votes = map(lambda f : f(arg), flist)
    # The initial value makes reduce safe on an empty vote list (the old
    # code raised a TypeError); results are unchanged for non-empty lists.
    return reduce(lambda x,y: x and y, votes, 1)


  def __allVoteNo(self, flist, arg):
    """
    Returns true if all the functions in flist evaluate to false given the
    argument arg.  An empty flist trivially votes no.
    """

    votes = map(lambda f : f(arg), flist)
    return not reduce(lambda x,y: x or y, votes, 0)

  def buildHierarchy(self, algorithm):
    """ buildHierarchy(algorithm)

    Build the hierarchical tree in an iterative manner.  The root node
    is set to the algorithm provided and a pre-order iterator is invoked
    on the root node.  Now for each node the iterator returns, run the
    algorithm, append the child nodes and take the .next() node from the
    iterator.

    This approach amounts to staying one step ahead of the iterator while
    building the tree and letting the iterator do the work of keeping track
    of the nodes visited.
    """

    #
    # As the tree is built we do not want to keep the datasets/labelings
    # in memory any longer than necessary.  So, to be able to track the
    # datapoints through the tree, each node has a partition tuple holding
    # the indices of its datapoints in the original dataset.
    #
    # These tuples only need to be stored in the leaf nodes, so as the
    # tree descends, the tuples in the parent nodes are removed.
    #
    # Each node also contains an algorithm object, but that object should
    # remove its dataset and labeling after running and only keep a copy
    # of its parameters.
    #

    numRows        = self.dataset.getNumRows()
    tree           = MultiWayTree()
    currKey        = 0

    root           = Node(currKey)
    currKey       += 1

    tree.insert(root)

    root.algorithm = algorithm
    root.prefix    = ()
    root.partition = tuple(range(numRows))

    if algorithm is None:
      # Degenerate hierarchy: a single root spanning every row.  Fixed to
      # return the tree itself (the old code returned the bare Node, which
      # run() would then fail to iterate).
      return tree

    #
    # Append the very basic epilogue and prologue functions
    #

    prologues = self.prologues
    prologues.insert(0, self.trueTerminator() )
    prologues.insert(0, self.algoTerminator() )

    epilogues = self.epilogues
    epilogues.insert(0, self.labelingTerminator() )
    epilogues.insert(0, self.clusterNumTerminator() )

    resets    = self.resets
    resets.append( self.falseTerminator() )

    #
    # Instantiate a pre-order iterator on the root; children are appended
    # one step ahead of the iterator as the tree grows.
    #

    nodeIter = tree.iterator('CLR')

    node = nodeIter.next()
    while node is not None:

      #
      # Build the dataset object on the fly.  We provide a copy of the
      # parameters since the Terminators (and algorithms themselves)
      # can place whatever data they wish in the parameter sets and
      # we can't have runs interfering with each other.
      #

      data = self.dataset.getData()

      #
      # For the root node, we don't have to make a subset.  This saves
      # a considerable amount of memory
      #

      if len(node.prefix) == 0:
        currentDataset = Dataset(data)
      else:
        currentDataset = Dataset(Numeric.take(data, node.partition))
      data = None

      #
      # Guard the setup calls: nodes created after a failed epilogue vote
      # carry algorithm = None and previously crashed here before the
      # algoTerminator prologue could reject them.
      #

      if node.algorithm is not None:
        node.algorithm.setDataset(currentDataset)
        node.algorithm.setParameters(self.parameters.copy())

      #
      # Evaluate the list of prologues and only continue if they all return
      # true.  There will be at least one default prologue which always
      # returns true
      #

      if self.__allVoteYes(prologues, node):

        #
        # Run the algorithm for the first time
        #

        status = node.algorithm.run()
        if status == compClust.mlx.wrapper.WRAPPER_STATUS_ERROR:
          MESSAGE_STREAM.write("algorithm.run() terminated with an error\n")

        #
        # Check if anyone has voted to reset, or retry, this clustering run.
        # Be VERY careful defining these functions as they may lead to
        # an infinite loop.  It is assumed that a reset function permutes the
        # algorithm in some way to give it a better chance.
        #
        # The default reset function always returns 0
        #

        while not self.__allVoteNo(resets, node):
          status = node.algorithm.run()

        #
        # Fetch the labeling and create the partition lists, we need to do
        # this here because the epilogue functions can change the algorithm
        # any way they want.  Also remove the class labeling when finished.
        #

        numRows        = currentDataset.getNumRows()
        labeling       = node.algorithm.getLabeling()

        if labeling is not None:
          sublists = map(labeling.getRowsByLabel, labeling.getLabels())
        else:
          sublists = [range(numRows)]

        labeling = None

        #
        # Evaluate the list of epilogues.  There are two default epilogues,
        # one which checks for a labeling, and one which determines if
        # there is more than 1 cluster.
        #
        # Epilogue functions decide if another level of clustering should be
        # performed
        #

        unanimous = self.__allVoteYes(epilogues, node)

        #
        # We are continuing down the tree, so we can remove the current
        # node's partition after we create the children.
        #
        # Dataset and labeling should both be None at this point so the
        # .copy() method only copies the parameters
        #
        # If the vote is NO, act similarly to above (create child nodes),
        # but _only_ fill in the partition information and reverse the tree
        # iteration
        #

        for i in range(len(sublists)):

          sublist = sublists[i]

          newNode        = Node(currKey)
          currKey       += 1

          # Child paths are the parent's path plus this cluster's ordinal.
          tmp            = tuple(str(i+1))
          newNode.prefix = node.prefix + tmp

          #
          # Copy the correct index numbers
          #

          length = len(sublist)
          newNode.partition = tuple(map(operator.getitem,
                                        [node.partition] * length, sublist))

          #
          # If it is determined that the next node will have an algorithm
          # to run, we will need to make a new instance of the algorithm.
          #
          # We have to use a .copy() instead of the AlgorithmFactory
          # because some algorithms (like MCCV) have information other than
          # their datasets and parameters.  In the case of MCCV, we were
          # losing the reference to the underlying algorithm and MCCV
          # would crash since mccv.algorithm = None
          #

          algo = None
          if unanimous:
            algo = node.algorithm.copy()
            algo.clear()

          newNode.algorithm = algo
          tree.append(node.key(), newNode)

        #
        # Now remove the class/partition information from the current node
        #

        node.prefix = None
        node.partition = None

        if not unanimous:
          nodeIter.reverse()

      #
      # Clear the algorithm, this should remove the labeling from the
      # subset used as the dataset and also all references to the parameters,
      # model and labeling
      #

      if node.algorithm is not None:
        node.algorithm.clear()
        node.algorithm = None

      #
      # Remove the dataset from the current node
      #

      currentDataset.resetVars()
      currentDataset = None

      #
      # garbage collect to keep memory free
      #

      gc.collect()

      #
      # We've constructed all the children for this node, so move on to
      # the next one.
      #

      node = nodeIter.next()

    #
    # If finished, return the tree
    #

    return tree

  def validate(self):
    """validate()

    Ensures that all parameters and environment variables necessary
    to run the clustering algorithm (Hierarchical) are defined.  Missing
    optional list parameters are defaulted to empty lists.

    Returns true when validation succeeds.
    """

    # There are currently no required parameters.
    parameter_names = []

    fail = 0

    if Verify.parameters_exist( parameter_names, self.parameters ):
      fail = 1

    #
    # Optional parameters
    #

    if not self.parameters.has_key( "prologues" ):
      self.parameters[ "prologues" ] = []

    if not self.parameters.has_key( "resets" ):
      self.parameters[ "resets" ] = []

    if not self.parameters.has_key( "epilogues" ):
      self.parameters[ "epilogues" ] = []

    return not fail

if __name__ == "__main__":
  import compClust.mlx.wrapper.Launcher

  # Hierarchical cannot be invoked directly from the command line, so
  # force the Launcher to display its help text by appending '--h'.
  argv = list(sys.argv)
  argv.append('--h')
  compClust.mlx.wrapper.Launcher.main(argv, Hierarchical())
