########################################
# The contents of this file are subject to the MLX PUBLIC LICENSE version
# 1.0 (the "License"); you may not use this file except in
# compliance with the License.
# 
# Software distributed under the License is distributed on an "AS IS"
# basis, WITHOUT WARRANTY OF ANY KIND, either express or implied.  See
# the License for the specific language governing rights and limitations
# under the License.
# 
# The Original Source Code is "compClust", released 2003 September 03.
# 
# The Original Source Code was developed by the California Institute of
# Technology (Caltech).  Portions created by Caltech are Copyright (C)
# 2002-2003 California Institute of Technology. All Rights Reserved.
########################################
#
# Distance From Mean Model
#

import Numeric

from compClust.mlx.interfaces import IModel
from compClust.util.DistanceMetrics import Euclidean

class DistanceFromMean(IModel):
  """
  Produces a model which determines fitness by summing the squared
  distances from each data point to its closest cluster mean.

  This model has the shortcoming that as k increases, the score continues
  to improve.  The reason for this is that the average squared distance
  continually decreases, thus raising the fitness score.
  """

  def __init__(self, means=None, data=None, labels=None):
    """
    Construct the model from an explicit sequence of cluster means, or
    from a dataset plus a labeling (the means are then computed per
    label).  If neither is supplied the model starts uninitialized
    (self.means is None) and evaluateFitness() will return 0.
    """
    if means is not None:
      self.means = means
    elif data is not None and labels is not None:
      self.initFromLabels(data, labels)
    else:
      self.means = None


  def initFromLabels(self, data, labels):
    """
    Given a dataset and a labeling, compute the per-label means and use
    them as the class means.
    """
    # Imported locally, presumably to avoid a circular import with
    # compClust.mlx.models -- TODO confirm.
    from compClust.mlx.models import compute_model_means

    self.means = compute_model_means(data, labels)


  def evaluateFitness(self, data):
    """
    Return the fitness of the model given a particular set of data.

    The fitness equation is:

                    N
         ----------------------
         N-1
         ----
         \                       2
          >   (mean_a - point_k )
         /
         ----
         k=0
     where point_k is one of the data points, mean_a is the closest
     cluster mean, and N is the number of data points.

     Returns 0 when the model is uninitialized or data is None.
     """

    N = len(data)

    distances = self.__computeClosestClusterDistances(data)
    if distances is None:
      return 0
    # Higher fitness corresponds to a smaller total squared distance
    # from the points to their closest means.
    return N / (Numeric.sum(distances ** 2))


  def __computeClosestClusterDistances(self, data_points):
    """
    For each data point, find its closest cluster mean and return an
    array holding the Euclidean distance from each point to that mean.
    Returns None if either the data points or the means are missing.

    Raises ValueError if the point dimensionality does not match the
    dimensionality of the cluster means.
    """

    if data_points is None or self.means is None:
      return None

    if len(data_points[0]) != len(self.means[0]):
      # Bug fix: the original interpolated the undefined names 'point'
      # and 'means' here, so the check raised NameError instead of the
      # intended ValueError.
      raise ValueError(("Dimensionality of the data point [%d] must equal " +
                        "the dimensionality of the cluster means [%d]") %
                       (len(data_points[0]), len(self.means[0])))

    bestDistances = Numeric.zeros(len(data_points), Numeric.Float)

    for point_index in range(len(data_points)):
      point = data_points[point_index]
      # Seed with the distance to the first mean, then keep the minimum
      # over the remaining means.
      bestDistances[point_index] = Euclidean(self.means[0], point)

      for mean_index in range(1, len(self.means)):
        curDistance = Euclidean(self.means[mean_index], point)
        if curDistance < bestDistances[point_index]:
          bestDistances[point_index] = curDistance

    return bestDistances


  def __repr__(self):
    # Bug fix: the original reported the class as "KMeansModel" and
    # crashed with TypeError (len of None) for an uninitialized model.
    if self.means is None:
      return "DistanceFromMean(means=None)"
    return "DistanceFromMean(k=%d, means=%s)" % (len(self.means),
                                                 repr(self.means))
  
