########################################
# The contents of this file are subject to the MLX PUBLIC LICENSE version
# 1.0 (the "License"); you may not use this file except in
# compliance with the License.
# 
# Software distributed under the License is distributed on an "AS IS"
# basis, WITHOUT WARRANTY OF ANY KIND, either express or implied.  See
# the License for the specific language governing rights and limitations
# under the License.
# 
# The Original Source Code is "compClust", released 2003 September 03.
# 
# The Original Source Code was developed by the California Institute of
# Technology (Caltech).  Portions created by Caltech are Copyright (C)
# 2002-2003 California Institute of Technology. All Rights Reserved.
########################################
#
#       Authors: Christopher Hart
#                Lucas Scharenbroich
#                Diane Trout
#
# Last Modified: $Date: 2003/09/04 01:51:33 $
#

"""
This module provides a collection of functions to compute distance metrics
between vectors.
"""

import math
import Numeric
import MLab
import LinearAlgebra

def distance(vector, array, metric='euclidean'):
    """
    Compute the distance between a vector and a vector of vectors.

    This is an optimized interface to provide for the rapid evaluation of
    a set of distance calculations.  The distance metric can be chosen
    by name.  The current choices are:

    * euclidean
    * manhattan
    * maximum
    * correlation
    * spearman

    Raises ValueError if the metric name is not one of the above.
    """

    if metric == 'euclidean':
      return EuclideanDistance(vector, array)
    elif metric == 'correlation':
      return CorrelationDistance(vector, array)
    elif metric == 'spearman':
      return SpearmanCorrelation(vector, array)
    elif metric == 'manhattan':
      return ManhattanDistance(vector, array)
    elif metric == 'maximum':
      return MaximumDistance(vector, array)
    else:
      # Name the offending value so callers can diagnose typos; the old
      # code raised a bare ValueError() with no message.
      raise ValueError("unknown distance metric '%s'; expected one of: "
                       "euclidean, manhattan, maximum, correlation, "
                       "spearman" % (metric,))


def EuclideanDistance(vector, array):
  """
  distance vector = EuclideanDistance(vector, array)

  Return the Euclidean distance from the vector to every row of the array.
  Given an m-by-n array and a length-n vector, the result is a length-m
  vector of distances.

                   k
  De(X,Y) = sqrt( SUM (x - y )^2 )
                  i=1   i   i
  """

  # Promote a bare 1-D array to a one-row matrix so that the row-wise
  # arithmetic below works uniformly.
  if len(array.shape) == 1:
    array = Numeric.reshape(array, (1, len(array)))

  difference = array - vector
  return Numeric.sqrt(Numeric.sum(difference ** 2, 1))
  
def ManhattanDistance(vector, array):
  """
  distance vector = ManhattanDistance(vector, array)

  Return the city-block (Manhattan) distance from the vector to each row
  of the array.  Cheaper to evaluate than Euclidean distance and useful
  in similar contexts.

             k
  Dc(X,Y) = SUM | x - y |
            i=1    i   i
  """

  # Treat a 1-D input as a single-row matrix.
  if len(array.shape) == 1:
    array = Numeric.reshape(array, (1, len(array)))

  difference = array - vector
  return Numeric.sum(Numeric.fabs(difference), 1)


def MaximumDistance(vector, array):
  """
  distance vector = MaximumDistance(vector, array)

  Return the maximum (Chessboard) distance from the vector to each row of
  the array:

  Dm(X,Y) = max{|x_i - y_i|, i = 1, ..., n}
  """

  # Treat a 1-D input as a single-row matrix.
  if len(array.shape) == 1:
    array = Numeric.reshape(array, (1, len(array)))

  absoluteDifference = Numeric.fabs(array - vector)
  return MLab.max(absoluteDifference, 1)


def CorrelationDistance(vector, array):
  """
  distance vector = CorrelationDistance(vector, array)

  Computes the correlation distance between points which is the inner
  product of two normalized means.  No point may be the zero point
  (a zero vector makes the normalization term below zero).

             n          x_i*y_i                    x.y
  Dc(X,Y) = SUM   --------------------------- =  ------
            i=1       (  n     2    n     2 )    |x||y|
                  sqrt( SUM x_i  * SUM y_i  )
                      ( i=1        i=1      )

  NOTE: despite the name this is a similarity (cosine of the angle
  between the vectors) -- larger values mean *closer* points.  See
  InverseCorrelationDistance for the distance-like variant.

  NOTE(review): `vector` appears to be assumed 1-D; only `array` gets the
  1-D-to-2-D promotion, and a 2-D vector would make `u` below a vector
  rather than a scalar -- confirm against callers.
  """

  # Promote a bare 1-D array to a one-row matrix.
  if len(array.shape) == 1:
    array = Numeric.reshape(array, (1, len(array)))

  # u = |x|^2; w[j] = |x| * |array[j]|, the normalization per row.
  u = Numeric.sum(vector ** 2)
  w = Numeric.sqrt(u * Numeric.sum(array ** 2, 1))

  # transpose makes the per-row norms broadcast across the feature axis;
  # the default axis-0 sum then sums over features, giving one value per
  # row of `array`.
  return Numeric.sum(Numeric.transpose(vector*array) / w)

def InverseCorrelationDistance(vector, array):
   """
   Return 1 - CorrelationDistance(vector, array).

   Useful for tools that expect nearby points to have a small distance
   and far-apart points to have a large one, since CorrelationDistance
   itself grows with similarity.
   """
   similarity = CorrelationDistance(vector, array)
   return 1 - similarity

def PearsonCorrelation(vector, mean, array):
  """
  distance vector = PearsonCorrelation(vector, mean, array)

  Correlate the vector against each row of the array after centering both
  about a particular mean vector.  With a zero mean vector this reduces
  exactly to CorrelationDistance.
  """

  centeredVector = vector - mean
  centeredArray  = array - mean
  return CorrelationDistance(centeredVector, centeredArray)

def _rank_row(row):
  """
  Return the tie-adjusted ranks of the elements of the 1-D array `row`.

  p[i][j] == row[i] and its transpose p_prime[i][j] == row[j]; summing the
  comparisons down axis 0 counts, for each element j, how many elements
  are strictly smaller and how many are equal.  Ties share the average of
  the ranks they span: strict_count + (equal_count + 1) / 2.
  """
  p       = Numeric.outerproduct(row, Numeric.ones(row.shape))
  p_prime = Numeric.transpose(p)
  return Numeric.sum(p < p_prime) + (Numeric.sum(p == p_prime) + 1) / 2.0

def ranks(x):
  """
  ranks(x)

  Code similar to the octave function of the same name.  If x is a
  vector, return a vector of ranks of x adjusted for ties.
  If x is a matrix, do the above for each row.

  (The 1-D and per-row computations previously duplicated the same
  rank logic; it now lives once in _rank_row.)
  """

  if len(x.shape) == 1:
    return _rank_row(x)

  # Rank each row independently and reassemble into a matrix.
  return Numeric.array([_rank_row(row) for row in x])
  
def SpearmanCorrelation(vector, array):
  """
  distance vector = SpearmanCorrelation(vector, array)

  Spearman correlation differs from Pearson correlation in that it does
  not care about the values in the vectors, but only their relative rank.

  The vector and array given are augmented by replacing the ith element
  of each vector with its rank within the samples.

                     6      k                2
  Ds(x,y) = 1 - ---------- SUM (R(x) - R(y) )
                n(n^2 - 1) i=0      i      i

  where n is the number of sample and k is the length of each vector
  """

  #
  # Check for 1-D input arrays and move them to 1xn 2-D matricies
  #

  if len(vector.shape) == 1:
    vector = Numeric.reshape(vector, (1, len(vector)))

  if len(array.shape) == 1:
    array = Numeric.reshape(array,   (1, len(array)))

  #
  # Combine the inputs into a single, transposed matrix and ensure that 
  # it is contiguous
  #

  fullArray = Numeric.transpose(Numeric.concatenate((array, vector)))
  fullArray = Numeric.reshape(Numeric.ravel(fullArray), fullArray.shape)
  
  #
  # Compute the ranks.
  #
  # NOTE(review): because of the transpose above, ranks() sees one row per
  # *feature*, so each element is ranked across the m+1 vectors rather
  # than within its own vector.  The textbook Spearman statistic ranks
  # within each vector -- confirm this per-feature variant is intended.
  #

  rankArray = Numeric.transpose(ranks(fullArray))
  
  #
  # ...and pop off the input vector's rank row, then shrink the array back
  # to the m rows that came from `array` (resize keeps the leading rows).
  #
  
  rankVector = rankArray[-1]
  
  shape      = rankArray.shape
  rows       = shape[0]
  rankArray  = Numeric.resize(rankArray, (shape[0] - 1, shape[1]))

  #
  # Subtract the ranks, square, and sum over the feature axis -- one value
  # per row of `array`.
  #

  corr = Numeric.sum(Numeric.transpose((rankArray - rankVector) ** 2))

  # Apply 1 - 6 * sum(d^2) / (n^3 - n).
  #
  # NOTE(review): `rows` is the number of vectors (m + 1), while the sum
  # above runs over the vector length; the classical formula uses the same
  # n for both.  Verify the normalization before relying on absolute
  # values of this statistic.
  corr *= -6
  corr /= (rows**3 - rows)
  corr += 1

  #
  # Already normalized at this point; just hand the result back.
  #

  return corr


def MahalanobisDistance(vector, mean, covariance):
  """
  distance value = MahalanobisDistance(vector, mean, covariance)

  Computes the Mahalanobis distance between the point defined in vector
  and a class with mean value defined in mean and a covariance matrix.
  The return value is a single value since this is a measure of distance
  from a point to a distribution.

                       T -1
  DM(X, M, E) = (X - M) E  (X - M)


  The mean can be a 2-D matrix and covariance a 3-D array of matricies,
  in which case one value is computed per (mean, covariance) pair and a
  list of values is returned.
  """

  #
  # Promote 1-D / 2-D inputs so the code below can always iterate over a
  # stack of points, means and covariance matrices.
  #

  if len(vector.shape) == 1:
    vector = Numeric.reshape(vector, (1, len(vector)))

  if len(mean.shape) == 1:
    mean = Numeric.reshape(mean,   (1, len(mean)))

  if len(covariance.shape) == 2:
    covariance = Numeric.reshape(covariance, (1, covariance.shape[0],
                                              covariance.shape[1]))

  #
  # Invert every covariance matrix up front.  (The old version grew
  # Einverse inside a bare try/except as a first-iteration hack, which
  # could also mask unrelated errors; an explicit list cannot.)
  #

  inverses = []
  for cov in covariance:
    inv = LinearAlgebra.inverse(cov)
    inverses.append(Numeric.reshape(inv, (1, inv.shape[0], inv.shape[1])))
  Einverse = Numeric.concatenate(inverses)

  XminusM  = mean - vector

  #
  # Element-by-element evaluation of (X - M)^T E^-1 (X - M) for each
  # (difference row, inverse covariance) pair.
  #

  return map(Numeric.matrixmultiply,
             map(Numeric.matrixmultiply, XminusM, Einverse),
             XminusM)
  
def BhattacharyyaDistance(mean1, covariance1, mean2, covariance2):
  """
  distance value = BhattacharyyaDistance(mean1, covariance1,
                                         mean2, covariance2)

  computes the Bhattacharrya distance between two classes defined by
  a mean vector and their respective covariance matricies.

                       1         T        -1                 | E1 + E2 |  
  DM(M1, E1, M2, E2) = -(M1 - M2) (E1 + E2) (M1 - M2) + ln ---------------
                       4                                        1/2    1/2
                                                             |E1|   |E2|

  NOTE(review): the implementation does not quite match this docstring:
  it inverts the *averaged* covariance (E1+E2)/2 while still multiplying
  by 0.25 (effectively 0.5 * (M1-M2)^T (E1+E2)^-1 (M1-M2)), and the log
  term is not halved.  The result appears to be exactly twice the
  textbook Bhattacharyya distance.  Confirm the intended convention
  before rescaling -- callers may depend on the current values.
  """

  # Average covariance and mean difference used by both terms.
  E1E2    = (covariance1 + covariance2) / 2.0;
  M1M2    = mean1 - mean2
  detE1   = LinearAlgebra.determinant(covariance1)
  detE2   = LinearAlgebra.determinant(covariance2)
  detE1E2 = LinearAlgebra.determinant(E1E2)
  
  # Quadratic term: (M1 - M2)^T E1E2^-1 (M1 - M2).
  firstTerm = Numeric.matrixmultiply(M1M2, LinearAlgebra.inverse(E1E2))
  firstTerm = Numeric.matrixmultiply(firstTerm, M1M2)

  # Log-determinant term: ln(|E1E2| / sqrt(|E1| * |E2|)).
  secondTerm = detE1E2 / (math.sqrt(detE1) * math.sqrt(detE2))
  secondTerm = math.log(secondTerm)
  
  return 0.25 * firstTerm + secondTerm
  
  
def Euclidean(vector1, vector2=None):
  """Compute the Euclidean distance between two vectors.

  vector1 -- sequence or Numeric array.

  vector2 -- [optional] sequence or Numeric array.  If not provided, the
  origin is assumed.

  Raises TypeError if the two vectors have different shapes.  This
  version is slower than the functional version below.
  """
  # Coerce both arguments to Numeric arrays exactly once; the old return
  # line redundantly re-wrapped v1 and v2 in Numeric.array() again.
  if vector2 is None:
    v2 = Numeric.zeros(len(vector1), Numeric.Float)
  elif type(vector2) != Numeric.ArrayType:
    v2 = Numeric.array(vector2)
  else:
    v2 = vector2

  if type(vector1) != Numeric.ArrayType:
    v1 = Numeric.array(vector1)
  else:
    v1 = vector1

  if v1.shape != v2.shape:
    raise TypeError("vectors are of different lengths")

  return math.sqrt(Numeric.sum((v1 - v2) ** 2))
  
def EuclideanList(vector1, vector2=None):
  """Compute the euclidean distance between two vectors.

  vector1 -- vector in list format.

  vector2 -- [optional] vector in list format.  If not provided, the
  origin is assumed.

  Raises TypeError if the two vectors have different lengths.  An empty
  vector has distance 0 to the origin (the old reduce() chain raised
  TypeError on empty input).
  """
  if vector2 is None:
    vector2 = [0] * len(vector1)
  elif len(vector1) != len(vector2):
    raise TypeError("vectors are of different lengths")

  # Sum of squared element-wise differences.  A sum of squares of real
  # numbers is never negative, so math.sqrt cannot raise ValueError here;
  # the old print-and-reraise handler around it was dead code.
  total = 0
  for x, y in zip(vector1, vector2):
    difference = x - y
    total += difference * difference

  return math.sqrt(total)
  
def FilteredEuclidean(vector1, vector2=None, filter_function=lambda x: 1):
  """Compute the euclidean distance between two vectors.

  vector1 -- vector in list format.

  vector2 -- [optional] vector in list format.  If not provided, the
  origin is assumed.

  filter_function -- [optional] predicate applied to vector1 and vector2
  independently; only elements for which it returns true participate in
  the distance.

  Raises TypeError if the unfiltered vectors have different lengths, or
  if the filter keeps a different number of elements from each vector
  (the old map(lambda x,y: x-y, ...) crashed obscurely on None-padding
  in that case).
  """
  # `is None`, not `== None`: identity test for the missing-argument case.
  if vector2 is None:
    vector2 = [0] * len(vector1)
  elif len(vector1) != len(vector2):
    raise TypeError("vectors are of different lengths")

  # Apply the mask to each vector separately, exactly as filter() would.
  kept1 = [x for x in vector1 if filter_function(x)]
  kept2 = [y for y in vector2 if filter_function(y)]

  if len(kept1) != len(kept2):
    raise TypeError("filter_function kept a different number of elements "
                    "from each vector")

  # Sum of squared element-wise differences over the surviving elements;
  # the sum is never negative, so math.sqrt cannot raise ValueError.
  total = 0
  for x, y in zip(kept1, kept2):
    difference = x - y
    total += difference * difference

  return math.sqrt(total)
  

def NormalizedEuclideanDistance(vector, array, covarianceMatrix):
  """
  distance vector = NormalizedEuclideanDistance(vector, array,
                                                covarianceMatrix)

  Compute the variance-normalized Euclidean distance between the vector
  and each row of the array.  For an m-by-n array and a length-n vector,
  a length-m vector of distances is returned.  The per-dimension
  variances are taken from the diagonal of covarianceMatrix.  (The old
  docstring named the wrong function and showed division by variance**2;
  the code divides by the variance itself.)

                   k
  De(X,Y) = sqrt( SUM (x - y )^2 / var )
                  i=1   i   i        i
  """

  # Promote a bare 1-D array to a one-row matrix, for consistency with
  # the other distance functions in this module (previously missing).
  if len(array.shape) == 1:
    array = Numeric.reshape(array, (1, len(array)))

  # Per-dimension variances live on the diagonal of the covariance matrix.
  variance = MLab.diag(covarianceMatrix)

  # Normalize each squared difference by its dimension's variance, then
  # sum over dimensions (transpose + default axis-0 sum) and take the
  # square root.
  return (Numeric.sqrt(Numeric.sum( Numeric.transpose( (((array - vector)**2)/variance) ))))








