########################################
# The contents of this file are subject to the MLX PUBLIC LICENSE version
# 1.0 (the "License"); you may not use this file except in
# compliance with the License.
# 
# Software distributed under the License is distributed on an "AS IS"
# basis, WITHOUT WARRANTY OF ANY KIND, either express or implied.  See
# the License for the specific language governing rights and limitations
# under the License.
# 
# The Original Source Code is "compClust", released 2003 September 03.
# 
# The Original Source Code was developed by the California Institute of
# Technology (Caltech).  Portions created by Caltech are Copyright (C)
# 2002-2003 California Institute of Technology. All Rights Reserved.
########################################

"""
Usage: KMeans.py parameterFilename datasetFilename resultsFilename
  
 Wrapper for kmeans algorithm

 Depends on the following environment variables:
   KMEANS_COMMAND   (e.g., /proj/cluster_gazing2/bin/kmeans)
  
 Algorithm parameters include the following name value pairs.  Unless
 a default is indicated, the parameter is required.
  
     distance_metric
       Either the word "correlation" or "euclidean" (include quotes).
  
     init_means
       The word "church", or "random", "random_range", or
       "random_sample" (include quotes).
  
     k
       The number of clusters, k, to find.
  
     k_strict
       If "true", kmeans will treat k as a strict parameter.  That is,
       if k clusters could not be found (after the optional
       max_restarts restarts, in the case of randomly initialized
       means), no result will be reported.  Defaults to "false".
  
     num_iterations
       The number of kmeans iterations.
  
     max_restarts
       The maximum number of restarts in the case of collapsed clusters
       (valid only for randomly initialized means).  Defaults to 0.
  
     num_mean_samples
       If init_means = "random_sample", this parameter indicates the
       number of datapoints to sample (without replacement) when
       estimating initial means.  Defaults to 3.
  
     seed
       The seed to use for the pseudo-random number generator (valid only
       for randomly initialized means).  Defaults to 42.
  
   An example parameter file:
  
     distance_metric  = "euclidean"
     init_means       = "random_sample"
     k                = 5
     num_iterations   = 100
     max_restarts     = 10
     num_mean_samples = 3
     seed             = 1234
"""
     
import os
import string
import sys
import types
import tempfile
import warnings
import Numeric

from compClust.config import config
from compClust.util import Verify
from compClust.util import WrapperUtil
from compClust.util.TimeStampedPrintStream import TimeStampedPrintStream

from compClust.mlx.labelings import Labeling, ClusteredLabeling
from compClust.mlx.models import DistanceFromMean
import compClust.mlx.ML_Algorithm as ML_Algorithm

import compClust.mlx.wrapper

#
# MESSAGE_STREAM
#
# Module-level message stream; KMeans.__init__ installs it on every new
# instance through setMessageStream().  The constructor argument looks like
# a strftime-style prefix template ending in "KMeans: " -- TODO confirm
# against TimeStampedPrintStream.
#

MESSAGE_STREAM = TimeStampedPrintStream("%Y-%b-%d %H:%M: KMeans: ")
# Debug switch.  Nothing in the code reviewed here reads it.
DEBUG = 0

#
# KMeans Parameter Def
#

# Descriptions - Parameters
#
# Human-readable help text for the algorithm parameters.  max_starts_desc,
# num_mean_samples_desc and seed_desc are passed as doc= values in the
# Parameters class; k_strict_desc is kept for the k_strict option, which
# createClusteringCommandLine() still honours.
#
# The embedded newlines and the single leading space on continuation
# lines are part of the string values and are preserved deliberately.

# Fixed: the text used to refer to a non-existent "num_restarts"
# parameter; the parameter is called max_restarts.
k_strict_desc = (
  'If "true", kmeans will treat k as a strict parameter.  That is,\n'
  ' if k clusters could not be found, (after an optional\n'
  ' max_restarts, in the case of randomly initialized means) no\n'
  ' result will be reported.'
)

# NOTE: the variable name (max_starts, not max_restarts) is kept as is
# because other code refers to it by this name.
max_starts_desc = (
  'The maximum number of restarts in the case of collapsed clusters '
  '(valid only for randomly initialized means).'
)

num_mean_samples_desc = (
  'If init_means = "random_sample", this parameter indicates the\n'
  ' number of datapoints to sample (without replacement) when\n'
  ' estimating initial means.\n'
)

seed_desc = (
  'The seed to use for the pseudo-random number generator '
  '(valid only for randomly initialized means).'
)

import compClust.util.WrapperParameters as wp

class Parameters(wp.WrapperParameters):
  """Parameter set accepted by the KMeans wrapper.

  _params declares one property object per parameter, grouped by
  priority: REQUIRED algorithm settings, OPTIONAL tuning knobs, and
  INTERNAL values that KMeans.run() fills in itself before building the
  command line.  The second positional argument of each property appears
  to be its default value -- TODO confirm against WrapperParameters.
  """
  _params = [
    # required parameters
    wp.IntProperty('k', 2, min=2,
                   doc='The number of clusters, k, to find.',
                   priority=wp.Priority.REQUIRED),
    wp.ComboProperty('distance_metric', 'euclidean',
                     ['euclidean', 'correlation'],
                     doc='Distance metric of "correlation" or "euclidean"',
                     priority=wp.Priority.REQUIRED),
    wp.ComboProperty('init_means', 'random',
                     ['church', 'random', 'random_range', 'random_sample'],
                     doc='How to chose inital KMeans cluster means',
                     priority=wp.Priority.REQUIRED),
    wp.IntProperty('num_iterations', 100, min=0,
                   doc="The maximum number of iterations to perform, in case the algorithm doesn't converge.",
                   priority=wp.Priority.REQUIRED),
    # optional parameters
    # FIXME: should this be a boolean property?
    # NOTE(review): the FIXME above presumably refers to k_strict, which
    # KMeans.createClusteringCommandLine() looks up but which is not
    # declared in this list -- confirm whether it should be added.
    # NOTE(review): the module docstring says max_restarts defaults to 0,
    # while the value declared here is 10 -- confirm which is intended.
    wp.IntProperty('max_restarts', 10,
                   doc=max_starts_desc,
                   priority=wp.Priority.OPTIONAL),
    wp.IntProperty('seed', 42,
                   doc=seed_desc,
                   priority=wp.Priority.OPTIONAL,),
    wp.IntProperty('num_mean_samples', 3,
                   doc=num_mean_samples_desc,
                   priority=wp.Priority.OPTIONAL),
    # internal parameters (assigned by KMeans.run(): the names of the
    # files exchanged with the kmeans program and the dataset dimensions)
    wp.StrProperty('clusteringInputFilename', priority=wp.Priority.INTERNAL),
    wp.StrProperty('clusteringOutputFilename', priority=wp.Priority.INTERNAL),
    wp.IntProperty('rows', priority=wp.Priority.INTERNAL),
    wp.IntProperty('cols', priority=wp.Priority.INTERNAL),

    # Disabled declarations, kept for reference; none of these names is
    # used by the KMeans class.
    #wp.StrProperty('clusteringMeansFilename', priority=wp.Priority.INTERNAL),
    #wp.StrProperty('clusteringVarianceFilename',priority=wp.Priority.INTERNAL),
    #wp.StrProperty('clusteringWeightsFilename', priority=wp.Priority.INTERNAL),
    #wp.StrProperty('clusteringProbsFilename', priority=wp.Priority.INTERNAL),
    #wp.StrProperty('clusteringInputFilename', priority=wp.Priority.INTERNAL),
    #wp.StrProperty('clusteringInternalFilename', priority=wp.Priority.INTERNAL),
    
    ]

#
# KMeans
#

class KMeans(ML_Algorithm.ML_Algorithm):
  """Wrapper that runs the external kmeans program on a dataset.

  run() writes the dataset into a scratch directory, launches the
  executable named by config.kmeans_command through os.system(), and
  loads the labels it produced into a ClusteredLabeling.

  Attributes:
    dataset          the dataset handed to the constructor
    parameters       a Parameters object built from the constructor argument
    labeling         labeling loaded by the most recent run(), or None
    model            DistanceFromMean model built lazily by getModel(), or None
    default_tempdir  value tempfile.tempdir is reset to at the end of run()
  """

  def __init__(self, dataset=None, parameters=None):
    """KMeans(dataset, parameters)

    Creates a new KMeans algorithm with the given dataset and
    algorithm parameters.  To run, use the run() method.
    """
    self.dataset    = dataset
    self.parameters = Parameters(parameters)
    self.labeling   = None
    self.model      = None

    # Remembered so that run() can put tempfile.tempdir back after it
    # has pointed it at a scratch directory.
    self.default_tempdir = tempfile.gettempdir()

    self.setMessageStream( MESSAGE_STREAM )


  def copy(self):
    """new_obj = copy()

    Returns a new KMeans constructed from this object's dataset and
    parameters.  The labeling and model objects are shared with the
    original, not duplicated.
    """
    new_obj = KMeans(self.dataset, self.parameters)
    new_obj.labeling = self.labeling
    new_obj.model    = self.model
    return new_obj


  def getLabeling(self):
    """labeling = getLabeling()

    Returns the labeling loaded by the most recent run(), or None if
    no labeling has been loaded yet.
    """
    return self.labeling


  def getModel(self):
    """model = getModel()

    Returns the DistanceFromMean model for the current dataset and
    labeling.  The model is built on first request and cached until
    run() invalidates it.
    """
    if self.model is None:
      self.model = DistanceFromMean(data=self.dataset, labels=self.labeling)
    return self.model


  def run(self):
    """status = run()

    Runs the KMeans algorithm.  Invalidates self.model, sets
    self.labeling from the program's output file when one was written,
    and returns a status token: WRAPPER_STATUS_DONE on success, or
    WRAPPER_STATUS_ERROR when the program reported failure or produced
    no output file.

    The scratch directory is removed and tempfile.tempdir is restored
    even if an exception is raised part-way through.
    """
    #
    # Invalidate the current model; getModel() rebuilds it on demand.
    #
    self.model = None

    dataset        = self.dataset
    parameters     = self.parameters
    wrapper_status = compClust.mlx.wrapper.WRAPPER_STATUS_DONE

    #
    # Creates a temporary directory for kmeans input and output files.
    # tempfile.tempdir is pointed at it so that tempfile.mktemp() below
    # picks the output file name inside that directory.
    #
    scratch_dir = WrapperUtil.create_temporary_directory("kmeans_")
    tempfile.tempdir = scratch_dir

    try:
      clusteringOutputFilename = tempfile.mktemp("cluster_output")
      clusteringInputFilename  = \
            WrapperUtil.create_clustering_input_file( dataset, scratch_dir )

      #
      # Prepare additional (internal) parameters.
      #
      parameters[ "clusteringInputFilename"  ] = clusteringInputFilename
      parameters[ "clusteringOutputFilename" ] = clusteringOutputFilename
      parameters[ "rows" ]                     = dataset.getNumRows()
      parameters[ "cols" ]                     = dataset.getNumCols()

      #
      # Create the kmeans command-line and launch it.
      #
      commandLine   = self.createClusteringCommandLine()
      system_result = os.system(commandLine)

      # On POSIX os.system() returns a wait()-encoded status, which is
      # zero only for a normal exit with code 0; elsewhere it is the
      # exit code itself.  Testing the raw value therefore works on
      # every platform, and also catches a child killed by a signal,
      # which decoding only WEXITSTATUS would report as success.
      # (win95/98/me do not return the exit status at all, so a failure
      # there can only be noticed through the missing output file.)
      if system_result != 0:
        wrapper_status = compClust.mlx.wrapper.WRAPPER_STATUS_ERROR

      #
      # Load the clustering results into a Labeling.
      #
      if os.access( clusteringOutputFilename, os.F_OK ):
        stream = open(clusteringOutputFilename, "r")
        try:
          text = [line.strip() for line in stream.readlines()]
        finally:
          stream.close()
        self.labeling = ClusteredLabeling(dataset, self.__class__, parameters)
        self.labeling.labelRows(text)
        os.remove( clusteringOutputFilename )
      else:
        # Oops no output file, perhaps the program failed
        wrapper_status = compClust.mlx.wrapper.WRAPPER_STATUS_ERROR
    finally:
      #
      # Cleanup temporary files and directory, then return
      # tempfile.tempdir to its default value whatever happened above.
      #
      try:
        self._removeScratchDirectory(scratch_dir)
      finally:
        tempfile.tempdir = self.default_tempdir

    return wrapper_status


  def _removeScratchDirectory(self, directory):
    """Deletes every file directly inside directory, then directory itself."""
    for name in os.listdir( directory ):
      os.remove(os.path.join(directory, name))
    os.rmdir( directory )


  def validate(self):
    """validate()

    Checks the required parameters (distance_metric, init_means, k and
    num_iterations) with Verify.parameters_exist() and returns a true
    value when that check reports no problem, a false value otherwise.
    Environment variables are not inspected here.
    """
    parameterNames = [ "distance_metric",
                       "init_means"     ,
                       "k"              ,
                       "num_iterations" ]

    error = 0

    # NOTE(review): a true result from Verify.parameters_exist() is
    # treated as "something is missing" -- presumably it returns an
    # error indicator rather than a success flag; confirm against Verify.
    if Verify.parameters_exist( parameterNames, self.parameters ):
      error = 1

    return not error


  def createClusteringCommandLine(self):
    """commandLine = createClusteringCommandLine()

    Returns the command-line used to run the KMeans: the executable,
    the option/value pairs, then the dataset and result filenames,
    joined by single spaces.

    NOTE: the pieces are not shell-quoted, and run() hands the string
    to os.system(); a path containing whitespace or shell
    metacharacters will not be passed through intact.
    """
    parameters = self.parameters

    # Executable followed by the options that are always emitted.
    command = [
      config.kmeans_command,
      "--rows",            repr( parameters["rows"] ),
      "--cols",            repr( parameters["cols"] ),
      "--distance-metric", parameters["distance_metric"],
      "--init-means",      parameters["init_means"],
      "--k",               repr( int(parameters["k"]) ),
      "--max-iterations",  repr( int(parameters["num_iterations"]) ),
    ]

    # K Strict is a bare flag, emitted only for the literal value "true".
    if parameters.has_key( "k_strict" ) and parameters["k_strict"] == "true":
      command.append("--k-strict")

    # Integer options that are emitted only when the parameter is present.
    optionalIntegers = ( ("max_restarts",     "--max-restarts"),
                         ("num_mean_samples", "--num-mean-samples"),
                         ("seed",             "--seed") )
    for name, flag in optionalIntegers:
      if parameters.has_key( name ):
        command.append( flag )
        command.append( repr( int(parameters[name]) ) )

    # Positional arguments: dataset filename, then result filename.
    command.append( parameters["clusteringInputFilename"] )
    command.append( parameters["clusteringOutputFilename"] )

    return " ".join(command)


  def create_clustering_command_line(self):
    """command_line = create_clustering_command_line()

    DEPRECATED.  Use createClusteringCommandLine() instead.
    Returns the command-line used to run the KMeans.
    """

    warnings.warn( "Use createClusteringCommandLine() instead.",
                   DeprecationWarning )

    # Fixed: self used to be passed explicitly as well, which raised a
    # TypeError on every call.
    return self.createClusteringCommandLine()


if __name__ == "__main__":
  # Script entry point: hand the command-line arguments and a KMeans
  # instance with no dataset or parameters to the wrapper launcher.
  from compClust.mlx.wrapper import Launcher

  algorithm = KMeans()
  Launcher.main(sys.argv, algorithm)



