########################################
# The contents of this file are subject to the MLX PUBLIC LICENSE version
# 1.0 (the "License"); you may not use this file except in
# compliance with the License.
# 
# Software distributed under the License is distributed on an "AS IS"
# basis, WITHOUT WARRANTY OF ANY KIND, either express or implied.  See
# the License for the specific language governing rights and limitations
# under the License.
# 
# The Original Source Code is "compClust", released 2003 September 03.
# 
# The Original Source Code was developed by the California Institute of
# Technology (Caltech).  Portions created by Caltech are Copyright (C)
# 2002-2003 California Institute of Technology. All Rights Reserved.
########################################

"""
Usage: KMedians.py parameterFilename datasetFilename resultsFilename
  
 Wrapper for kmedians algorithm

 Depends on the following environment variables:
   KMEDIANS_COMMAND   (e.g., /proj/cluster_gazing2/bin/kmedians)
  
 Algorithm parameters include the following name value pairs.  Unless
 a default is indicated, the parameter is required.
  
     distance_metric
       Either the word "correlation" or "euclidean" (include quotes).
  
     init_medians
       The word "church", or "random", "random_range", "random_sample" or
       "file" (include quotes).

     median_file
       Filename to load initial medians from.
  
     k
       The number of clusters, k, to find.
  
     k_strict
       If "true", kmedians will treat k as a strict parameter.  That is,
       if k clusters could not be found, (after an optional
       max_restarts, in the case of randomly initialized medians) no
       result will be reported.  Defaults to "false".
  
     num_iterations
       The maximum number of kmedians iterations.
  
     max_restarts
       The maximum number of restarts in the case of collapsed clusters
       (valid only for randomly initialized medians).  Defaults to 0.
  
     num_median_samples
       If init_medians = "random_sample", this parameter indicates the
       number of datapoints to sample (without replacement) when
       estimating initial medians.  Defaults to 3.
  
     seed
       The seed to use for the pseudo-random number generator (valid only
       for randomly initialized medians).  Defaults to 42.
  
   An example parameter file:
  
     distance_metric  = "euclidean"
     init_medians     = "random_sample"
     k                = 5
     num_iterations   = 100
     max_restarts     = 10
     num_mean_samples = 3
     seed             = 1234
"""
     
import os
import string
import sys
import types
import tempfile
import warnings
import Numeric

from compClust.config import config
from compClust.util import Verify
from compClust.util import WrapperUtil
from compClust.util.TimeStampedPrintStream import TimeStampedPrintStream

from compClust.mlx.labelings import Labeling
from compClust.mlx.models import DistanceFromMean
import compClust.mlx.ML_Algorithm as ML_Algorithm

import compClust.mlx.wrapper

#
# MESSAGE_STREAM
#
# Module-wide stream that prefixes messages with a timestamp and the
# "KMedians:" tag.  KMedians.__init__ installs it on every new instance
# through setMessageStream().
#

MESSAGE_STREAM = TimeStampedPrintStream("%Y-%b-%d %H:%M: KMedians: ")
# Debug switch; it is not read anywhere else in this module.
DEBUG = 0

#
# KMedians
#
# Descriptions - Parameters
#
# Help texts used as the doc= argument of the optional properties declared
# in the Parameters class below.
#
# NOTE(review): k_strict_desc is defined here but no 'k_strict' property is
# declared in Parameters, so this text is currently unused in this module.
k_strict_desc = """If \"true\", kmeans will treat k as a strict parameter.  That is,
 if k clusters could not be found, (after an optional
 num_restarts, in the case of randomly initialized means) no
 result will be reported."""
# Help text for the 'max_restarts' property.
max_starts_desc = 'The maximum number of restarts in the case of collapsed clusters (valid only for randomly initialized means).'
# Help text for the 'num_mean_samples' property.
num_mean_samples_desc = """If init_means = \"random_sample\", this parameter indicates the
 number of datapoints to sample (without replacement) when
 estimating initial means.
"""
# Help text for the 'seed' property.
seed_desc = 'The seed to use for the pseudo-random number generator (valid only for randomly initialized means).'

import compClust.util.WrapperParameters as wp

class Parameters(wp.WrapperParameters):
  # Declares the parameters understood by the KMedians wrapper.  Each entry
  # names a parameter, gives a value right after the name (presumably the
  # default -- confirm against WrapperParameters), a help text and a
  # priority (REQUIRED, OPTIONAL or INTERNAL).
  _params = [
    # required parameters (the same four names are checked by
    # KMedians.validate())
    wp.IntProperty('k', 2, min=2,
                   doc='The number of clusters, k, to find.',
                   priority=wp.Priority.REQUIRED),
    wp.ComboProperty('distance_metric', 'euclidean',
                     ['euclidean', 'correlation'],
                     doc='Distance metric of "correlation" or "euclidean"',
                     priority=wp.Priority.REQUIRED),
    # NOTE(review): the module docstring also lists a "file" choice with a
    # median_file parameter; neither is declared here.
    wp.ComboProperty('init_medians', 'random',
                     ['church', 'random', 'random_range', 'random_sample'],
                     doc='How to chose inital KMeans cluster means',
                     priority=wp.Priority.REQUIRED),
    # Passed to the kmedians executable as --max_iterations.
    wp.IntProperty('num_iterations', 100, min=0,
                   doc="The maximum number of iterations to perform, in case the algorithm doesn't converge.",
                   priority=wp.Priority.REQUIRED),
    # optional parameters
    # FIXME: should this be a boolean property?
    # NOTE(review): the FIXME above presumably refers to a k_strict flag,
    # which is not declared in this list -- confirm.  The module docstring
    # states a default of 0 for max_restarts, whereas the value here is 10.
    wp.IntProperty('max_restarts', 10,
                   doc=max_starts_desc,
                   priority=wp.Priority.OPTIONAL),
    wp.IntProperty('seed', 42,
                   doc=seed_desc,
                   priority=wp.Priority.OPTIONAL,),
    # NOTE(review): the kmedians option this corresponds to is spelled
    # --num_median_samples; keep this name in sync with
    # KMedians.createClusteringCommandLine().
    wp.IntProperty('num_mean_samples', 3,
                   doc=num_mean_samples_desc,
                   priority=wp.Priority.OPTIONAL),
    # internal parameters
    # Filled in by KMedians.run() with the temporary input/output file
    # names before the command line is built.
    wp.StrProperty('clusteringInputFilename', priority=wp.Priority.INTERNAL),
    wp.StrProperty('clusteringOutputFilename', priority=wp.Priority.INTERNAL),
  ]
  
class KMedians(ML_Algorithm.ML_Algorithm):
  
  def __init__(self, dataset=None, parameters=None):
    """KMedians(dataset, parameters)

    Builds a KMedians wrapper for the given dataset.  The supplied
    parameters are wrapped in a Parameters object.  Nothing is
    clustered here; call run() to execute the algorithm.
    """

    # Inputs of the algorithm.
    self.dataset    = dataset
    self.parameters = Parameters(parameters)

    # Results; both remain None until they are filled in later.
    self.model    = None
    self.labeling = None

    self.setMessageStream( MESSAGE_STREAM )

    # run() redirects tempfile.tempdir while it works; remember the
    # directory in effect now so that run() can put it back afterwards.
    self.default_tempdir = tempfile.gettempdir()
     
  def copy(self):
    """new_obj = copy()

    Returns a new KMedians constructed from this object's dataset and
    parameters.  The new object refers to the same labeling and model
    objects as this one.
    """
    duplicate = KMedians(self.dataset, self.parameters)
    duplicate.labeling, duplicate.model = self.labeling, self.model
    return duplicate

   
  def getLabeling(self):
    return self.labeling


  def getModel(self):
    if self.model is None:
      dataset = self.dataset
      labeling = self.labeling
      self.model = DistanceFromMean(data=dataset, labels=labeling)
    return self.model


  def run(self):
    """status = run()

    Runs the KMedians algorithm by launching the external kmedians
    program on self.dataset.

    The cached model is cleared first.  If the program leaves an output
    file behind, its lines (whitespace-stripped) are loaded into a new
    Labeling that is stored in self.labeling; otherwise self.labeling
    is left as it was.

    Returns compClust.mlx.wrapper.WRAPPER_STATUS_DONE when the command
    finished with a zero status and WRAPPER_STATUS_ERROR otherwise
    (including when the command was terminated by a signal).

    The scratch directory is removed and tempfile.tempdir is restored
    even if an exception is raised while the algorithm is running.
    """

    #
    # Invalidate the current model; getModel() rebuilds it on demand.
    #
    self.model = None

    dataset    = self.dataset
    parameters = self.parameters
    wrapper_status = compClust.mlx.wrapper.WRAPPER_STATUS_DONE

    #
    # Creates a temporary directory for kmeans input and output files.
    # tempfile.tempdir is pointed at it so that mktemp() below places
    # the output file inside that directory.
    #
    work_dir = WrapperUtil.create_temporary_directory("kmeans_")
    tempfile.tempdir = work_dir

    try:
      clusterOutputFilename = tempfile.mktemp("cluster_output")
      clusterInputFilename  = \
            WrapperUtil.create_clustering_input_file( dataset, work_dir )

      #
      # Prepare additional parameters
      #
      parameters[ "clusteringInputFilename"  ] = clusterInputFilename
      parameters[ "clusteringOutputFilename" ] = clusterOutputFilename

      #
      # Create kmeans command-line.
      #
      commandLine = self.createClusteringCommandLine()

      #
      # Launch kmeans.  The raw status returned by os.system() is
      # compared with zero: extracting only the exit code would report
      # success for a process that was killed by a signal.
      #
      system_result = os.system(commandLine)
      if system_result != 0:
        wrapper_status = compClust.mlx.wrapper.WRAPPER_STATUS_ERROR

      #
      # Load the clustering results into a Labeling.  The Model is
      # constructed from that Labeling later, by getModel().
      #
      if os.access( clusterOutputFilename, os.F_OK ):
        stream = open(clusterOutputFilename, "r")
        try:
          text = [ line.strip() for line in stream.readlines() ]
        finally:
          # Close the file even if reading it fails.
          stream.close()
        self.labeling = Labeling(dataset)
        self.labeling.labelRows(text)
        os.remove( clusterOutputFilename )

    finally:
      #
      # Cleanup temporary files and directory, whether or not the run
      # succeeded.
      #
      try:
        for name in os.listdir( work_dir ):
          os.remove(os.path.join(work_dir, name))
        os.rmdir( work_dir )
      finally:
        # return tempfile.tempdir to its default tempdir
        tempfile.tempdir = self.default_tempdir

    return wrapper_status


  def validate(self):
    """validate()

    Checks the required parameter names (distance_metric, init_medians,
    k and num_iterations) against self.parameters with
    Verify.parameters_exist() and returns the logical negation of that
    call's result.

    NOTE(review): a true result from Verify.parameters_exist() is
    treated as a failure here, so the helper presumably returns an
    error flag rather than "all present" -- confirm against Verify.
    No environment variables are examined by this method.
    """

    required_names = [ "distance_metric",
                       "init_medians",
                       "k",
                       "num_iterations" ]

    return not Verify.parameters_exist( required_names, self.parameters )


  def createClusteringCommandLine(self):
    """commandLine = createClusteringCommandLine()

    Returns the command-line used to run the KMedians.

    The result is one space-joined string made of, in order: the
    executable named by config.kmedians_command, the dataset
    dimensions, the algorithm options read from self.parameters, and
    the clustering input and output filenames.  Both filenames must
    already be present in self.parameters; run() stores them there
    before calling this method.

    The sample count for --num_median_samples is read from the
    "num_median_samples" parameter when present, otherwise from
    "num_mean_samples", which is the name declared by the Parameters
    class.

    The arguments are joined without any shell quoting.
    """

    parameters = self.parameters
    dataset    = self.dataset

    #
    # Executable, dataset dimensions (rows, columns) and the options
    # that are always passed: distance metric, median initialization
    # method, k and the iteration limit.
    #
    command = [ config.kmedians_command,
                "--rows",            repr( dataset.getNumRows() ),
                "--cols",            repr( dataset.getNumCols() ),
                "--distance_metric", parameters["distance_metric"],
                "--init_medians",    parameters["init_medians"],
                "--k",               repr( parameters["k"] ),
                "--max_iterations",  repr( parameters["num_iterations"] ) ]

    #
    # K Strict: a bare flag, emitted only for the string "true".
    #
    if parameters.has_key( "k_strict" ) and parameters["k_strict"] == "true":
      command.append("--k_strict")

    #
    # Max Restarts
    #
    if parameters.has_key( "max_restarts" ):
      command.extend( [ "--max_restarts", repr( parameters["max_restarts"] ) ] )

    #
    # Num Median Samples: the first of the two accepted parameter names
    # that is present supplies the value.
    #
    for name in ( "num_median_samples", "num_mean_samples" ):
      if parameters.has_key( name ):
        command.extend( [ "--num_median_samples", repr( parameters[name] ) ] )
        break

    #
    # Seed
    #
    if parameters.has_key( "seed" ):
      command.extend( [ "--seed", repr( parameters["seed"] ) ] )

    #
    # Dataset filename, then result filename (positional arguments).
    #
    command.append( parameters["clusteringInputFilename"] )
    command.append( parameters["clusteringOutputFilename"] )

    return " ".join(command)


  def create_clustering_command_line(self):
    """command_line = create_clustering_command_line()

    DEPRECATED.  Use createClusteringCommandLine() instead.
    Returns the command-line used to run the KMedians.
    """

    warnings.warn( "Use createClusteringCommandLine() instead.",
                   DeprecationWarning )

    return self.createClusteringCommandLine(self)


if __name__ == "__main__":
  # Script entry point: hand the command-line arguments and a fresh
  # KMedians instance (no dataset, no parameters) to the Launcher.
  from compClust.mlx.wrapper import Launcher

  algorithm = KMedians()
  Launcher.main(sys.argv, algorithm)