########################################
# The contents of this file are subject to the MLX PUBLIC LICENSE version
# 1.0 (the "License"); you may not use this file except in
# compliance with the License.
# 
# Software distributed under the License is distributed on an "AS IS"
# basis, WITHOUT WARRANTY OF ANY KIND, either express or implied.  See
# the License for the specific language governing rights and limitations
# under the License.
# 
# The Original Source Code is "compClust", released 2003 September 03.
# 
# The Original Source Code was developed by the California Institute of
# Technology (Caltech).  Portions created by Caltech are Copyright (C)
# 2002-2003 California Institute of Technology. All Rights Reserved.
########################################
#
#       Author: Lucas Scharenbroich
# 
# Original Implementation: September 8, 2002 by Lucas Scharenbroich

"""
Usage: SClust.py parameter_filename input_filename output_filename

 Wrapper for the sclust algorithm.

 Depends on the following environment variables:
   SCLUST_COMMAND   (e.g., /proj/cluster_gazing2/bin/sclust)

 Brief Algorithm Description:
   Sclust is a soft clustering algorithm. The program implements a form of 
   clustering derived via mean field approximation and globally optimized
   using deterministic annealing.  A slack cluster is included. 

 Required Parameters:

         k               = <x>

                           x is the number of clusters to find

 Optional / Dependent Parameters:

         threshold       = <x>

                           By default is set to 0.01.  This controls the
                           threshold of what classes to keep in a sparse
                           matrix structure.  If the class weight is below
                           the threshold, it is removed from the sparse
                           matrix.
 
         seed            = <x> (optional)

                           Where x is the number used to seed the random
                           number generator.  This parameter allows runs
                           of the algorithm to be deterministic.  If the
                           parameter is omitted, it will be initialized
                           to 42.
                           
         slack           = ['on', 'off']

                           Turns on a slack cluster.  This will create k+1
                           clusters where 'other' points go in the slack
                           cluster.  The 'reward' parameter influences the
                           slack cluster's affinity.  Defaults to 'off'.
                           Note the actual number of classes returned by the
                           algorithm is still k.
                           
         reward          = <x>

                           This is the reward value for points to avoid the
                           slack cluster.  If this value is too low, all the
                           data points might be assigned to the slack cluster.
                           By default the value is 50.  It should be
                           proportional to the distances between data points.
                           
         annealing       = ['on', 'off']

                           Turns on annealing.  If not specified, assumed to
                           be 'off'.

         initial_temp    = <x> (depends on annealing)

                           Starting temperature to run the annealer at.

         schedule        = <x> (depends on annealing)

                           Temperature schedule.  The initial_temp is
                           multiplied by this number every step.  Needs to be
                           in the range (0.0, 1.0), but should be in the
                           high 0.90s.
"""

import os
import sys
import tempfile
import string

from compClust.util.TimeStampedPrintStream import TimeStampedPrintStream
from compClust.util import Verify
from compClust.util import WrapperUtil

from compClust.mlx.labelings import Labeling
from compClust.mlx.models import MixtureOfDiagonalGaussians
from compClust.mlx.ML_Algorithm import ML_Algorithm

import compClust.mlx.wrapper

# Prefix pattern for every message this wrapper prints.  The directives are
# strftime-style (presumably expanded by TimeStampedPrintStream -- confirm):
# %M is the minute; the previous %m would have printed the month number
# after the hour.
MESSAGE_STREAM = TimeStampedPrintStream("%Y-%b-%d %H:%M: SClust: ")

class SClust(ML_Algorithm):
  def __init__(self, dataset = None, parameters = None):
    """Create an sclust wrapper.

    dataset    -- the dataset to cluster (defaults to None)
    parameters -- mapping of algorithm parameters (defaults to None);
                  see the module docstring for the recognised keys
    """
    self.setMessageStream( MESSAGE_STREAM )

    # Results start out empty; run() fills in the labeling and getModel()
    # builds the model on demand.
    self.labeling   = None
    self.model      = None

    self.parameters = parameters
    self.dataset    = dataset

    # Remember the temp directory in effect now: run() redirects
    # tempfile.tempdir and assigns this value back when it is done.
    self.default_tempdir = tempfile.gettempdir()
    
  def copy(self):
    """Return a new SClust that refers to the same dataset, parameters,
    labeling and model objects as this one (the members are shared, not
    duplicated)."""
    clone = SClust(self.dataset, self.parameters)
    clone.model    = self.model
    clone.labeling = self.labeling
    return clone

  
  def getLabeling(self):
    return self.labeling


  def getModel(self):
    if self.model is None:
      dataset = self.dataset
      labeling = self.labeling
      self.model = MixtureOfDiagonalGaussians(data=dataset, labels=labeling)
    return self.model
      
      
  def run(self):
    """run(self)

    Prepares the inputs to the clustering algorithm (sclust), runs it and
    loads the resulting class assignments into self.labeling.

    Returns compClust.mlx.wrapper.WRAPPER_STATUS_DONE on success and
    compClust.mlx.wrapper.WRAPPER_STATUS_ERROR when the sclust command
    finishes with a non-zero status or leaves no output file behind.  In
    the error case self.labeling is left as it was before the call.

    The scratch directory is removed and tempfile.tempdir is put back to
    its default even when one of the steps raises.
    """

    #
    # Assume success.  This will get changed if not the case
    #

    wrapper_status = compClust.mlx.wrapper.WRAPPER_STATUS_DONE

    #
    # Invalidate the current model
    #

    self.model = None

    #
    # Creates a temporary directory for input and output files.  Pointing
    # tempfile.tempdir at it makes the mktemp() calls below pick names
    # inside that directory.
    #

    scratch_dir = WrapperUtil.create_temporary_directory("tsplit_")
    tempfile.tempdir = scratch_dir

    try:

      #
      # Prepare data file and store in temporary location.
      #

      self.parameters["clusterInputFilename"]   = tempfile.mktemp(".i")
      self.parameters["clusterParamFilename"]   = tempfile.mktemp(".p")
      self.parameters["clusterOutputFilename"]  = tempfile.mktemp(".o")

      outputFilename = self.parameters["clusterOutputFilename"]

      #
      # Build the input file
      #

      self.create_input_file()

      #
      # Create sclust command-line.
      #

      command_line = self.create_clustering_command_line()

      #
      # Launch sclust.  The raw status from os.system() is tested rather
      # than only its exit-code part, so a run that was killed by a signal
      # is reported as an error as well.
      #

      system_result = os.system(command_line)

      if system_result != 0:
        wrapper_status = compClust.mlx.wrapper.WRAPPER_STATUS_ERROR

      #
      # Load the clustering results into a Labeling.
      #

      if os.access( outputFilename, os.F_OK ):

        rows = self.dataset.getNumRows()

        #
        # Read in the data manually
        #

        stream = open(outputFilename, "r")
        try:
          text = [line.strip() for line in stream.readlines()]
        finally:
          stream.close()

        #
        # Strip out only the class information: the first line is skipped
        # and the following `rows` lines are used as the row labels.
        #

        self.labeling = Labeling(self.dataset)
        self.labeling.labelRows(text[1:rows+1])

      else:

        #
        # No output file means there is nothing to label.
        #

        wrapper_status = compClust.mlx.wrapper.WRAPPER_STATUS_ERROR

    finally:

      #
      # Cleanup temporary files, then return tempfile.tempdir to its
      # default tempdir no matter how the cleanup went.
      #

      try:
        for entry in os.listdir( scratch_dir ):
          os.remove(os.path.join( scratch_dir, entry ))
        os.rmdir( scratch_dir )
      finally:
        tempfile.tempdir = self.default_tempdir

    return wrapper_status

 
  def create_input_file(self):
    """
    create_input_file(self)
    """

    #
    # The input file contains a header with algorithm parameters followed
    # by raw, whitespace-delimited (read by fscanf()) data.  Blank lines or
    # lines beginning with a '#' are skipped
    #
    # Format:
    #
    # rows columns \n
    # min_clusters max_clusters step_size \n
    # num_runs num_samples seed \n
    #
    # commit_reward \n default to 50
    # betamin \n       default to 0.01
    # betamax \n       default to 1.0
    # betafactor \n    default to 1.001
    # Mthreshfac \n    default to 0.01 
    # slack_flag \n    default to 0
    #
    # data ....
    #
    # cv params

    destStream   = open(self.parameters["clusterInputFilename"], "w")    

    num_rows = self.dataset.getNumRows()
    num_cols = self.dataset.getNumCols()
    k        = self.parameters['k']
    seed     = self.parameters['seed']

    #
    # Write the basic parameters
    #
    
    destStream.write("%s %s\n" % (num_rows, num_cols))
    destStream.write("%s %s 1\n" % (k, k))
    destStream.write("1 %s %s\n\n" % (num_rows, seed))

    #
    # Optional information
    #
    
    destStream.write("%s\n" % self.parameters.get('reward', 50))

    #
    # Annealing parameteres
    #
          
    if self.parameters['annealing'] == 'on':
      destStream.write("%s\n" % (1.0 / float(self.parameters['initial_temp'])))
      destStream.write("1.0\n")
      destStream.write("%s\n" % (1.0 / float(self.parameters['schedule'])))
    else:
      destStream.write("0.9995\n")
      destStream.write("1.0\n")
      destStream.write("1.001\n")

    #
    # Sparse matrix threshold
    #

    destStream.write("%s\n" % self.parameters.get('threshold', 0.01))

    #
    # Slack clustering
    #

    destStream.write("%s\n" % (self.parameters.get('slack', 'off') == 'on'))
    
    self.dataset.writeDataset(destStream)
    destStream.close()

  def validate(self):
    """validate(self)

    Ensures that all parameters and environment variables necessary
    to run the clustering algorithm (sclust) are defined.

    As a side effect the optional parameters 'seed' and 'annealing' are
    given their defaults (42 and 'off') when they are absent.

    Returns a true value when everything required is present and a false
    value otherwise.
    """

    #
    # These are the _required_ parameters needed.
    #

    environment_names = [ "SCLUST_COMMAND" ]
    parameter_names   = [ "k" ]

    err = 0

    #
    # NOTE(review): the Verify helpers are used here as if a true result
    # means "something is missing" -- confirm against compClust.util.Verify.
    #

    if Verify.environment_variables_exist( environment_names ):
      err = 1

    if Verify.parameters_exist( parameter_names, self.parameters ):
      err = 1

    #
    # Explicitly check for a seed, if one does not exist provide a default
    # value.  setdefault() is used instead of has_key(), which newer
    # interpreters no longer provide.
    #

    self.parameters.setdefault("seed", 42)

    #
    # Annealing defaults to 'off'; when it is 'on' the two annealing
    # parameters become required as well.
    #

    if self.parameters.setdefault("annealing", "off") == "on":
      if Verify.parameters_exist( ["initial_temp", "schedule"],
                                  self.parameters):
        err = 1

    return not err
      
      
  def create_clustering_command_line(self):
    """command_line = create_clustering_command_line(self)
     
    """
     
    space        = " "
    command      = []
    command_line = ""
  
    #
    # SClust expects two command line parameters, an input filename
    # and an output filename
    #
    
    command.append(os.environ["SCLUST_COMMAND"])
  
    #
    # Input filename
    #
  
    command.append(self.parameters["clusterInputFilename"])
    
    #
    # Output file name (the final file name)
    #
  
    command.append(self.parameters["clusterOutputFilename"])
   
    #
    # Send stdout output to the bit bucket
    #
  
    command_line = space.join(command)
    
    return command_line
  
# Script entry point: pass the command-line arguments and a fresh SClust
# instance to the wrapper launcher.
if __name__ == "__main__":
  from compClust.mlx.wrapper import Launcher

  Launcher.main(sys.argv, SClust())