###########################################################################
#                                                                         #
# C O P Y R I G H T   N O T I C E                                         #
#  Copyright (c) 2005 by:                                                 #
#    * California Institute of Technology                                 #
#                                                                         #
#    All Rights Reserved.                                                 #
#                                                                         #
###########################################################################
#
#          Authors: Joe Roden, Brandon King & Diane Trout
# $LastChangedDate: 2005-12-12 16:31:43 -0800 (Mon, 12 Dec 2005) $
#        $Revision: 1411 $
#
__version__  = '1.2'
__revision__ = '$Rev: 1411 $'
__date__     = '$LastChangedDate: 2005-12-12 16:31:43 -0800 (Mon, 12 Dec 2005) $'

import copy
import math
import sets
import sys
import types

import matplotlib
import matplotlib.numerix as nx
try:
  import matplotlib.pylab as pylab
except ImportError, e:
  # try older version of matplotlib matlab(r)-like commands
  import matplotlib.matlab as pylab

# ttest from "stats.py" in pyNMS, so requires PYTHONPATH to ~/Software/pyNMS/lib
# see also python-stats package
#import stats

try:
  import rpy
except ImportError, e:
  print >>sys.stderr, "Warning: rpy not available, disabling pcaGinzu"
  
from compClust.mlx import datasets
from compClust.mlx import labelings
from compClust.mlx import views
from compClust.mlx.views import RowPCAView
from compClust.score.ConfusionMatrix2 import ConfusionMatrix

# Module globals: identifiers for the two extreme-point selection strategies.
# pcaGinzu stores one of these in self._outlierCutoffMode and uses it to pick
# both the selection algorithm and the names of the labelings it creates.

# Select a fixed number (nOutliers) of rows at each end of a PC axis.
CUTOFF_MODE_N_OUTLIER = 'nOutlier'
# Select rows whose normal tail probability along a PC axis is <= outlierCutoff.
CUTOFF_MODE_OUTLIER_CUTOFF = 'outlierCutoff'

#Exceptions
class OutlierCutoffError(Exception):
  """Raised when the outlier-selection settings are unusable.

  Used when both nOutliers and outlierCutoff are supplied, or when the stored
  outlier cutoff mode is not one of the known CUTOFF_MODE_* values.
  """

#matplotlib.use('TkAgg')

class pcaGinzu:

  # NOTE, if you change the parameter list here, you'll need to change it in the
  # compClust.iplot.IPlot* modules, compClust.mlx.pcaGinzu, and PCAGinzu
  def __init__(self, dataset, nOutliers=None, outlierCutoff=None,
               sigCutoff = 0.05, maxPCNum = None, 
               verbose = False, rowPCAView = None, makeLabelings = True):
    """
    Description: 

      Creates a basic pcaGinzu object given a dataset and optional parameters 
      that control the pcaGinzu analysis.

      The object holds the dataset, constructs (or is given) a RowPcaView on
      which subsequent analysis is based, and parameters, all the stuff that is
      necessary for a pcaGinzu analysis.

    Simplest Usage:
      p = pcaGinzu ( myDataset )

    See also:
      - the pcaGinzuVisualizeMatplotlib class in compClust.mlx.pcaGinzu
      - the PCAGinzu class in the compClust.iplot.PCAGinzu module

    Optional Arguments:

      Arguments controlling how "outliers" aka "extreme points" are selected.
      You should specify one of these two parameters.  If neither is specified,
      the default is outlierCutoff=0.001.  Specifying both raises
      OutlierCutoffError.  The two arguments you can adjust are:

      - outlierCutoff: The preferred method is to select extreme points that
                       are at the fringes of the distribution of values along
                       a particular principal component's axis.  We assume the
                       values are distributed roughly in a Gaussian shape, and
                       so estimate the likelihood of each point belonging to
                       that distribution.  Points having a likelihood less than
                       or equal to this cutoff (at either end of the axis) are
                       identified as "outliers" aka "extreme points" for that 
                       principal component.  Default is outlierCutoff=0.001.

      - nOutliers:     The original method is to select an explicit number of
                       data points at each end of a principal component's axis
                       that will be labeled as "outliers" aka "extreme points".

    Additional optional arguments:
    
    - sigCutoff:     This is the significance level below which you reject the
                     hypothesis that the high outliers and low outliers are
                     drawn from the same distribution.  Wilcoxon test determines
                     the likelihood of this hypothesis for each condition, and
                     the conditions that meet this threshold are labeled 
                     "up" or "down" conditions.  Default = 0.05.

    - maxPCNum:      If you specify this, you can limit the number of principal
                     components to fewer than the number of columns.  This is
                     useful if you have a very high dimensional dataset and are
                     not interested in analyzing every last principal component.
                     The number is 1-origin, so if you say 3 you will produce
                     labelings for PC1, PC2 and PC3.  Default = the number of
                     columns in given dataset.

    - verbose:       If set to True, will output status messages to standard
                     output while sub-operations are taking place.
                     Default = False.

    - rowPCAView:    You can provide a RowPCAView for the existing dataset so 
                     a new rowPCAView will not be created, but the existing one 
                     will be used.  By default (none given) one will be created 
                     (unless there's already one named "Row PCA View" attached 
                     to the dataset).

    - makeLabelings: If False, the row & column labelings that pcaGinzu
                     normally creates at construction will not be performed.
                     This is useful to save computation & space in case you are
                     passing a dataset that already has the proper pca row and 
                     column labelings to the constructor.  More recently the
                     code can figure out if the labelings already exist and
                     avoid re-creating the labelings.

    Raises:
    - RuntimeError:       the rpy module has not been imported.
    - OutlierCutoffError: both nOutliers and outlierCutoff were given.

    Notes/ideas:
    - Assumes it's given a MLX dataset.  (should verify & error if not, or
      maybe create one !)
    - maxPCNum is passed as 1-origin, and is also stored internally as
      1-origin so that it matches nCols, this can be used directly in range.
    - Regarding makeLabelings, row and column labelings names are unique to the
      outlierCutoff or number of outliers; this permits users to explore
      different settings of outlierCutoff or nOutliers without throwing
      away previously computed row and column labelings.
    - We want to make the significances and mean outlier differences per
      column into column labelings so one routine computes them, and the
      other routines that need them just reference them. I've started this 
      by writing makeColumnLabelingForPCNOutlierMeanDiffs, but the other
      routines don't look up the resulting column labeling yet.  Consider
      doing the same for significances as well.  Make sure the labeling
      names contain the number of outliers so exploring multiple nOutliers is
      easy & efficient.
    - Probably want to make makeRowLabelingsForPCOutliers loop over a function
      that computes the row labeling for one principal component (like 
      makeColumnLabelingsForSigGroups does), so that we can explore just a
      few principal components quickly without computing all of them.
    """
    if 'rpy' not in sys.modules:
      raise RuntimeError("rpy is not available, pcaGinzu will not work")
    
    self.verbose = verbose
    if self.verbose:
      print("Initializing pcaGinzu:")
    self.dataset = dataset
    dData = self.dataset.getData()
    # Global extremes of the data.  Take the extreme of every row first and
    # then the extreme of those; max(max(dData)) would instead pick one
    # "largest" row by row-vs-row comparison and only look inside that row.
    self.maxOriginalData = max([max(row) for row in dData])
    self.minOriginalData = min([min(row) for row in dData])
    
    ####################################################
    # Outlier mode section                             #
    ####################################################
    #FIXME: Should this be factored out into a small function?
    
    self.nOutliers = nOutliers
    self.outlierCutoff = outlierCutoff
    
    #Catch both methods being set
    if self.nOutliers is not None and self.outlierCutoff is not None:
      msg = 'Expecting nOutliers (%s) or outlierCuttoff (%s) to be set... Both set.' % (self.nOutliers, self.outlierCutoff)
      raise OutlierCutoffError(msg)
    
    #Default to outlierCutoff = 0.001 if no outlier controls are specified.
    if self.nOutliers is None and self.outlierCutoff is None:
      self.outlierCutoff = 0.001
    
    # Exactly one of the two controls is set from here on.
    if self.nOutliers is not None:
      #nOutliersMode
      self._outlierCutoffMode = CUTOFF_MODE_N_OUTLIER
      if self.verbose:
        # nOutliers is a count, so report it as an integer
        print("  Extreme point method: nOutliers = %d" % self.nOutliers)
    else:
      self._outlierCutoffMode = CUTOFF_MODE_OUTLIER_CUTOFF
      if self.verbose:
        print("  Extreme point method: outlierCutoff = %f" % self.outlierCutoff)
    ####################################################
    
    if rowPCAView is not None:
      if self.verbose:
        print("  1. Using existing PCA view...")
      self.rowPCAView = rowPCAView
    else:
      # still we can check if one is already attached to this dataset
      if self.verbose:
        print("  1. Setting up RowPCA view...")
      self.rowPCAView = self.dataset.addViewDefault('RowPCAView', RowPCAView,
                                                    self.dataset)

    self.sigCutoff = sigCutoff
    self.nRows = self.rowPCAView.getNumRows()
    self.nCols = self.rowPCAView.getNumCols()

    # maxPCNum is 1-origin; default to analysing every principal component
    if maxPCNum is not None:
      self.maxPCNum = maxPCNum
    else:
      self.maxPCNum = self.nCols

    if makeLabelings:
      if self.verbose:
        print("  2. Setting up row labelings for outliers...")
      self.makeRowLabelingsForPCOutliers()

      if self.verbose:
        print("  3. Setting up column labelings for significant groups...")
      self.makeColumnLabelingsForSigGroups()

    if self.verbose:
      print("  Done initializing pcaGinzu.")

  def getOutputForPCNOutliers(self, pcNum, labelingNameList=None):
    """
    Returns a 2D string array describing each of the high & low outliers
    for the given pcNum.  There is one row per outlier, and the columns
    contain the values from the labelings in the labelingNameList (one column
    per row labeling provided, e.g. a 'ProbeID' column, a 'Description'
    column, etc.).  The very first row contains column headings.  The very
    first column contains the value of the High/Low labeling for this
    principal component (either "high" or "low"), and the second column
    contains the row's PC-<N> value.  Data rows are sorted with
    secondColumnCompare, i.e. on that PC-<N> value.

    If the High/Low labeling for pcNum does not exist yet, the row labelings
    are (re)built via makeRowLabelingsForPCOutliers first.

    You can use write2DStringArrayToFile if you want this output to a file.
    
    :Parameters:
      -`pcNum`: which principal component to return, 1-origin, e.g. 1,2,...,max
      -`labelingNameList`: the list of row labeling names (or Labeling
                           objects) to return results for; defaults to none

    :Raises:
      -`ValueError`: a named labeling is not attached to the dataset, or no
                     High/Low labeling could be found or built for pcNum
    """
    # None stands in for "no extra columns" to avoid a shared mutable default
    if labelingNameList is None:
      labelingNameList = []

    labeling = self._getHighLowLabelingByPCN(pcNum)
    if labeling is None:
      self.makeRowLabelingsForPCOutliers()
      labeling = self._getHighLowLabelingByPCN(pcNum)
    if labeling is None:
      raise ValueError("No High/Low row labeling is available for PC-%s" \
                       % pcNum)

    #Get all indices for this labeling, high outliers first, then low
    indices = []
    indices.extend(labeling.getRowsByLabel('high'))
    indices.extend(labeling.getRowsByLabel('low'))

    #Annotation values collected per outlier row, one entry per labeling
    columnHeaders = []
    annotations = {}
    for index in indices:
      annotations[index] = []

    #Process each label in labelingNameList
    for labelName in labelingNameList:
      # if we're a labeling don't bother trying to do a lookup 
      if isinstance(labelName, labelings.Labeling):
        rowLabeling = labelName
        # head the column with the labeling's name, not the object itself
        columnHeaders.append(rowLabeling.getName())
      else:
        rowLabeling = self.dataset.getLabeling(labelName)
        if rowLabeling is None:
          raise ValueError("<%s> was not found in the labels attached to <%s>" \
                           % (labelName, self.dataset.getName()))
        columnHeaders.append(labelName)

      #Get label for this labeling for each of the rows from this
      # PC dimension.
      # NOTE(review): indices come from getRowsByLabel but are passed to
      # getLabelByKey; this presumably relies on row keys equalling row
      # indices -- confirm against the labelings API.
      for index in indices:
        annotations[index].append(rowLabeling.getLabelByKey(index))

    #Prepare header
    header = [labeling.getName(), 'PC-%d Value' % pcNum]
    header.extend(columnHeaders)

    #Prepare data
    output = []
    for index in indices:
      pcaRowData = self.rowPCAView.getData(index)
      row = [labeling.getLabelByRow(index), pcaRowData[pcNum-1]]
      row.extend(annotations[index])
      output.append(row)

    # sort output by 2nd column, the PC-N value for each row; the header is
    # inserted afterwards so it always stays on top
    output.sort(secondColumnCompare)
    output.insert(0,header)

    return output

  def getOutputForSigGroups(self, pcNum, labelingNameList=None):
    """
    Returns a 2D string array describing each of the columns of the dataset.
    Rows of the output describe each column in the dataset, (one row per
    dataset column) and the output rows are sorted by decreasing difference
    of high means and low means, i.e. the value
      mean(high outliers) - mean(low outliers)
    computed per column of the dataset.  Columns with equal differences are
    listed in original column order.  The columns of the output contain
    annotations describing the dataset columns, one column per column
    labeling provided.  The first row is a header line.  The first column is
    the value of the Up/Flat/Down labeling for this principal component,
    either "up", "flat" or "down".

    To make interpretation of results easiest, this output row ordering is
    meant to be the same as the plot X axis ordering of the appropriate
    outlier-trajectory plot, namely plotPCNOutlierRowsInSigGroupOrder
    Possibly need to make two (plot & output) routines or options to them
    that control the ordering method.
    
    Note: The output rows are approximately sorted as up/flat/down, because
    sorting by decreasing mean difference approximates that.  In the future 
    we might want/need to produce output for all "up" first, then all "flat", 
    then all "down", and within group sort by decreasing mean difference.

    If the High/Low or Up/Flat/Down labeling for pcNum does not exist yet it
    is built first.

    You can use write2DStringArrayToFile if you want this output to a file.
    
    :Parameters:
      -`pcNum`: which principal component to return
      -`labelingNameList`: the list of column labeling names (or Labeling
                           objects) to return results for; defaults to none

    :Raises:
      -`ValueError`: a named labeling is not attached to the dataset, or the
                     labelings for pcNum could not be found or built
    """
    # None stands in for "no extra columns" to avoid a shared mutable default
    if labelingNameList is None:
      labelingNameList = []

    highLow = self._getHighLowLabelingByPCN(pcNum)
    if highLow is None:
      self.makeRowLabelingsForPCOutliers()
      highLow = self._getHighLowLabelingByPCN(pcNum)
    if highLow is None:
      raise ValueError("No High/Low row labeling is available for PC-%s" \
                       % pcNum)

    sigGroupLabeling = self._getUpFlatDownLabelingByPCN(pcNum)
    if sigGroupLabeling is None:
      self.makeColumnLabelingForPCNSigGroups(pcNum)
      sigGroupLabeling = self._getUpFlatDownLabelingByPCN(pcNum)
    if sigGroupLabeling is None:
      raise ValueError("No Up/Flat/Down column labeling is available for "
                       "PC-%s" % pcNum)

    data = self.dataset.getData()

    def columnMeans(rows):
      # Per-column mean over the given rows; all zeros when there are none.
      if len(rows) == 0:
        return pylab.zeros((1,self.nCols),'d')[0]
      subset = pylab.zeros((len(rows),self.nCols),'d')
      for i in range(len(rows)):
        subset[i,:] = data[rows[i],:]
      return nx.average(subset,0)

    highMeans = columnMeans(highLow.getRowsByLabel('high'))
    lowMeans  = columnMeans(highLow.getRowsByLabel('low'))
    meanDiffs = nx.subtract(highMeans,lowMeans)

    # want to output columns in descending order by mean differences.
    # Sort (negated difference, column) pairs so that every column appears
    # exactly once even when several columns share the same difference.
    decorated = [ (-float(meanDiffs[i]), i) for i in range(self.nCols) ]
    decorated.sort()
    sortedIndices = [ col for (negDiff, col) in decorated ]

    #Annotation values collected per dataset column, one entry per labeling
    columnHeaders = []
    annotations = {}
    for index in sortedIndices:
      annotations[index] = []

    #Process each label in labelingNameList
    for labelName in labelingNameList:
      # if we're a labeling don't bother trying to do a lookup 
      if isinstance(labelName, labelings.Labeling):
        colLabeling = labelName
        # head the column with the labeling's name, not the object itself
        columnHeaders.append(colLabeling.getName())
      else:
        colLabeling = self.dataset.getLabeling(labelName)
        if colLabeling is None:
          raise ValueError("<%s> was not found in the labels attached to <%s>" \
                           % (labelName, self.dataset.getName()))
        columnHeaders.append(labelName)

      #Get label for this labeling for each dataset column
      for index in sortedIndices:
        annotations[index].append(colLabeling.getLabelByCol(index))

    #Prepare header
    header = [sigGroupLabeling.getName()]
    header.extend(columnHeaders)
    output = [header]

    #Prepare data
    for index in sortedIndices:
      row = [sigGroupLabeling.getLabelByCol(index)]
      row.extend(annotations[index])
      output.append(row)

    return output


  def makeRowLabelingsForPCOutliers(self):
    """
    Creates all row labelings (or 1 thru maxPCNum) sequentially that
    describe the high & low outliers for each principal component.

    How outliers are chosen depends on the mode fixed by the constructor:

      - outlierCutoff mode: a normal distribution is fitted (mean/variance
        via R) to the values along the principal component; rows whose lower
        tail probability is <= outlierCutoff are labeled 'low' and rows whose
        upper tail probability is <= outlierCutoff are labeled 'high'.

      - nOutliers mode: the nOutliers rows with the smallest values are
        labeled 'low' and the nOutliers rows with the largest values 'high'.

    Labelings that already exist on the dataset are left untouched.
    Raises OutlierCutoffError if the stored mode is unknown.
    """
    # Fetched lazily, and only once, the first time a labeling must be built
    pcaData = None
    rowKeys = None

    for pcaDim in range(self.maxPCNum):

      # single source of truth for the labeling name (raises on unknown mode)
      labelingName = self._getHighLowLabelingNameByPCN(pcaDim+1)
        
      if self.dataset.getLabeling(labelingName) is not None:
        if self.verbose:
          print('     - Row labeling %s already exists; skipping' % labelingName)
        continue
      if self.verbose:
        print('     - Creating row labeling: %s' % labelingName)

      if pcaData is None:
        pcaData = self.rowPCAView.getData()
        # assumed to be in the same order as the rows of pcaData
        rowKeys = self.rowPCAView.getRowKeys()

      label = labelings.GlobalLabeling(self.rowPCAView, labelingName)

      pcaCol = [ line[pcaDim] for line in pcaData ]
      
      #OUTLIER CUTOFF MODE
      if self._outlierCutoffMode == CUTOFF_MODE_OUTLIER_CUTOFF:
        mu = rpy.r.mean(pcaCol)
        stdev = math.sqrt(rpy.r.var(pcaCol))
        probabilities = nx.array(rpy.r.pnorm(pcaCol,mu,stdev))
  
        # pylab.find yields row positions; translate them to row keys since
        # addLabelToKeys is given keys in the nOutliers mode below as well
        lowIndices  = [ rowKeys[i] for i in
                        pylab.find(probabilities<=self.outlierCutoff) ]
        highIndices = [ rowKeys[i] for i in
                        pylab.find(1-probabilities<=self.outlierCutoff) ]
      
      #N OUTLIER CUTOFF MODE
      elif self._outlierCutoffMode == CUTOFF_MODE_N_OUTLIER:
        value_rowkey_list = list(zip(pcaCol, rowKeys))
        value_rowkey_list.sort()
        lowIndices = [ key for value,key in value_rowkey_list[:self.nOutliers]]
        if self.nOutliers == 0:
          # [-0:] is the whole list, which would label every row 'high'
          highIndices = []
        else:
          highIndices = [ key for value,key in value_rowkey_list[-self.nOutliers:]]

        assert len(lowIndices) <= self.nOutliers                
        assert len(highIndices) <= self.nOutliers                

      if len(highIndices)>0:
        label.addLabelToKeys(self.rowPCAView, 'high', highIndices)

      if len(lowIndices)>0:
        label.addLabelToKeys(self.rowPCAView, 'low',  lowIndices)
        

  def makeColumnLabelingsForSigGroups(self):
    """
    Call makeColumnLabelingForPCNSigGroups once for each principal component.
    In the future, might be nice to make this work with an optional input
    range.
    """
    for pcaDim in range(self.maxPCNum):
      self.makeColumnLabelingForPCNSigGroups(pcaDim+1)

  def makeColumnLabelingForPCNSigGroups(self,pcNum):
    """
    Create one labeling of the columns given a principal component number 
    (1-based, e.g. pc 1, pc 2, ...).  A new column labeling (named by
    _getUpFlatDownLabelingNameByPCN) assigns the values 'up', 'flat', and
    'down' to each column as follows:

      up   = the PC-N high outliers for this column are significantly higher
             than the PC-N low outliers

      flat = the PC-N high outliers for this column are not significantly
             different than PC-N low outliers

      down = the PC-N high outliers for this column are significantly lower
             than PC-N low outliers

    Significance is the p-value of R's wilcox.test on the low vs. high
    outlier values of a column, compared against self.sigCutoff.  When either
    outlier group is empty every column is labeled 'flat'.

    Does nothing if the labeling already exists.  If the High/Low row
    labeling for pcNum is missing it is built first; ValueError is raised if
    it still cannot be found.
    """
    
    newLabelingName = self._getUpFlatDownLabelingNameByPCN(pcNum)
                        
    if self.dataset.getLabeling(newLabelingName) is not None:
      if self.verbose:
        print('     - Column labeling %s already exists; skipping' \
              % newLabelingName)
      return
    if self.verbose:
      print('     - Creating column labeling: %s' % newLabelingName)

    # Resolve and compute everything before creating the new labeling, so a
    # failure cannot leave behind an empty labeling that later calls would
    # skip as "already exists".
    highLow = self._getHighLowLabelingByPCN(pcNum)
    if highLow is None:
      self.makeRowLabelingsForPCOutliers()
      highLow = self._getHighLowLabelingByPCN(pcNum)
    if highLow is None:
      raise ValueError("No High/Low row labeling is available for PC-%s" \
                       % pcNum)
    
    highrows = highLow.getRowsByLabel('high')
    lowrows = highLow.getRowsByLabel('low')

    # we could be missing either high or low outliers, so be careful
    # when calculating statistics per column
    data = self.dataset.getData()
    
    if len(highrows) > 0:
      highdata = pylab.zeros((len(highrows),self.nCols),'d')
      for i in range(len(highrows)):
        highdata[i,:] = data[highrows[i],:]
      highMeans = nx.average(highdata,0)
    else:
      highMeans = pylab.zeros((1,self.nCols),'d')[0]

    if len(lowrows) > 0:
      lowdata = pylab.zeros((len(lowrows),self.nCols),'d')
      for i in range(len(lowrows)):
        lowdata[i,:] = data[lowrows[i],:]
      lowMeans = nx.average(lowdata, 0)
    else:
      lowMeans = pylab.zeros((1,self.nCols),'d')[0]

    pValues = pylab.ones(self.nCols,'d') # init to 1's, not signif. by default
    if len(lowrows)>0 and len(highrows)>0:
      # the wilcox test spews a bunch of warning messages, so lets ignore
      # them; always put R's previous options back, even if the test fails
      saved_rpy_opts = rpy.r.options(warn=-1)
      try:
        for i in range(0,self.nCols):
          rResult = rpy.r.wilcox_test(lowdata[:,i],highdata[:,i])
          pValues[i] = rResult['p.value']
      finally:
        rpy.r.options(**saved_rpy_opts)
    
    meanDiffs = nx.subtract(highMeans,lowMeans)
  
    significant = (pValues <= self.sigCutoff)
    upSig   = pylab.find(nx.logical_and((meanDiffs >= 0), significant))
    downSig = pylab.find(nx.logical_and((meanDiffs <  0), significant))
    flat    = pylab.find(pValues > self.sigCutoff)

    label = labelings.GlobalLabeling(self.rowPCAView, newLabelingName)
    label.addLabelToCols(self.rowPCAView, 'up',    upSig)
    label.addLabelToCols(self.rowPCAView, 'flat',  flat)
    label.addLabelToCols(self.rowPCAView, 'down',  downSig)
 
  def makeColumnLabelingForPCNOutlierMeanDiffs(self, pcNum):
    """
    Creates a labeling for this dataset that contains the mean difference 
    of high outliers vs. low outliers, i.e. calculates and saves the value 
      mean(high outliers) - mean(low outliers)
    per column into a labeling named "PC-<N> <M>-outlier Mean Differences"
    (nOutliers mode) or "PC-<N> <cutoff><p Mean Differences" (outlierCutoff
    mode).  A group with no outliers contributes a mean of zero.

    Does nothing if the labeling already exists.  Only the High/Low row
    labeling for pcNum is required.  Raises OutlierCutoffError if the stored
    outlier cutoff mode is unknown.

    Note: In a fit of refactoring I got the idea that it is smarter
    to calculate this info once and attach it as a labeling so that other 
    pcaGinzu functions can just look up this info, rather than compute these
    values again.  However, the other routines have not been changed to call
    this yet.  *** ALSO, should do the same for t-test significance!
    """
    if self._outlierCutoffMode == CUTOFF_MODE_N_OUTLIER:
      newLabelingName = 'PC-%d %d-outlier Mean Differences' % \
                        (pcNum, self.nOutliers)
    elif self._outlierCutoffMode == CUTOFF_MODE_OUTLIER_CUTOFF:
      newLabelingName = 'PC-%d %f<p Mean Differences' % \
                        (pcNum, self.outlierCutoff)
    else:
      # same failure mode as the other mode-dependent methods
      msg = 'OutlierCutoffMode of "%s" unknown' % (self._outlierCutoffMode)
      raise OutlierCutoffError(msg)
      
    if self.dataset.getLabeling(newLabelingName) is not None:
      if self.verbose:
        print('     - Column labeling %s already exists; skipping' \
              % (newLabelingName))
      return
    if self.verbose:
      print('     - Creating column labeling: %s' % newLabelingName)

    highLow = self._getHighLowLabelingByPCN(pcNum)
    highrows = highLow.getRowsByLabel('high')
    lowrows = highLow.getRowsByLabel('low')

    data = self.dataset.getData()

    if len(highrows) > 0:
      highdata = pylab.zeros((len(highrows),self.nCols),'d')
      for i in range(len(highrows)):
        highdata[i,:] = data[highrows[i],:]
      highMeans = nx.average(highdata,0)
    else:
      highMeans = pylab.zeros((1,self.nCols),'d')[0]

    if len(lowrows) > 0:
      lowdata = pylab.zeros((len(lowrows),self.nCols),'d')
      for i in range(len(lowrows)):
        lowdata[i,:] = data[lowrows[i],:]
      lowMeans = nx.average(lowdata, 0)
    else:
      lowMeans = pylab.zeros((1,self.nCols),'d')[0]
    
    meanDiffs = nx.subtract(highMeans,lowMeans)

    # NOTE(review): the labeling is created on self.dataset but labels are
    # added through self.rowPCAView, unlike the other make* methods which use
    # rowPCAView for both -- confirm this is intended.
    label = labelings.GlobalLabeling(self.dataset, newLabelingName)
    for i in range(self.nCols):
      label.addLabelToCol(self.rowPCAView, meanDiffs[i], i)


  def scoreColumnLabelingsForPCN(self,pcNum, minSetSize=2, verbose=False):
    """
    For each column labeling (except the Up/Flat/Down labelings) score how
    well it agrees with this pcNum's implied Up/Flat/Down column
    partitioning.

      - Numeric labelings get three Wilcoxon p-values (via R's wilcox.test):
        up vs. flat, up vs. down and flat vs. down.  A comparison whose two
        sets are not both at least minSetSize large is given the value 2,
        which lies outside the p-value range, to mark "not tested".

      - Discrete labelings get a single score, the averageNMI() of a
        ConfusionMatrix built from the Up/Flat/Down labeling and the column
        labeling.

    Parameters:
      - `pcNum`: specifies which principal component to generate scores for
      - `minSetSize`: specifies what the minimum size the two sets used to 
                      compute the scores needs to be. For instance minSetSize=3 
                      means that in an up vs flat comparison, both the up and 
                      flat sets would need to have at least 3 members in order
                      to make a comparison).
      - `verbose`: if True, print progress and the computed scores

    Returns:
      A list of ColumnScore objects, one per scored column labeling.

    Raises:
      ValueError if there is no Up/Flat/Down labeling for pcNum.
    """
    class ColumnScore:
      """Score record for one column labeling."""
      def __init__(self):
        self.upcols = None       # columns labeled 'up' (numeric labelings only)
        self.flatcols = None     # columns labeled 'flat' (numeric only)
        self.downcols = None     # columns labeled 'down' (numeric only)
        self.is_discrete = None  # False for numeric labelings, True otherwise
        self.labeling = None     # the column labeling that was scored
        self.scores = None       # 3-tuple (numeric) or single value (discrete)
        
      def min_score(self):
        """Continuous have 3 scores, discrete only have 1 score, this returns 
        the min to make sorting easier"""
        if self.is_discrete:
          return self.scores
        else:
          return min(self.scores)

      def get_vals(self, columns):
        """Return the values for a specified column list (e.g. up, flat, down)
        """
        vals = self.labeling.getLabelsByCols(columns)
        vals = [ x[0] for x in vals if x ]
        return vals        

    def valuesFor(labeling, columns):
      # First label of every labeled column; empty when no columns are given.
      # Warning, 2 argument version assumes all labels are GlobalLabelings
      if len(columns) == 0:
        return []
      return [ x[0] for x in labeling.getLabelsByCols(columns) if x ]

    def pairwisePValue(first, second):
      # we should not test for significance if we have too few data values
      # to compare; 2 marks the comparison as "not tested"
      if len(first)>=minSetSize and len(second)>=minSetSize:
        return rpy.r.wilcox_test(first,second)['p.value']
      return 2
    
    #Get ufd labeling for PCN based on cutoff mode
    ufd = self._getUpFlatDownLabelingByPCN(pcNum)
    
    # a missing labeling is reported as None by getLabeling elsewhere in this
    # class; the empty-list check is kept for backward compatibility
    if ufd is None or ufd == []:
      msg = 'ERROR: no Up/Flat/Down labeling for PC number ' + str(pcNum)
      raise ValueError(msg)

    # every column labeling except the Up/Flat/Down ones themselves
    columnLabelings = [ l for l in self.dataset.getLabelings()
                        if l.getName().find('Up/Flat/Down') == -1
                        and l.isColLabeling() ]

    scores = []
    for l in columnLabelings:
      column_score = ColumnScore()
      column_score.labeling = l
      if l.isNumeric():
        column_score.is_discrete = False
        if verbose:
          print('column labeling ' + l.getName() + ' is numeric')
        column_score.upcols   = ufd.getColsByLabel('up')
        column_score.flatcols = ufd.getColsByLabel('flat')
        column_score.downcols = ufd.getColsByLabel('down')

        upvals   = valuesFor(l, column_score.upcols)
        flatvals = valuesFor(l, column_score.flatcols)
        downvals = valuesFor(l, column_score.downcols)

        # the wilcox test makes a bunch of warning messages, so lets supress
        # them; always restore R's previous options, even if a test fails
        saved_rpy_opts = rpy.r.options(warn=-1)
        try:
          upVsFlat   = pairwisePValue(upvals, flatvals)
          upVsDown   = pairwisePValue(upvals, downvals)
          flatVsDown = pairwisePValue(flatvals, downvals)
        finally:
          rpy.r.options(**saved_rpy_opts)

        if verbose:
          print('3 scores are: '+str(upVsFlat) + ',' + str(upVsDown) + \
                ',' + str(flatVsDown))
        column_score.scores = (upVsFlat,upVsDown,flatVsDown)
        
      else:
        column_score.is_discrete=True
        if verbose:
          print('column labeling ' + l.getName() + ' is discrete')
        cm = ConfusionMatrix([ufd,l])
        column_score.scores = cm.averageNMI()
        if verbose:
          print('score is: '+str(column_score.scores))
      scores.append(column_score)
      
    return scores


  def getPCsForOutlierRow(self, rowNumber):
    """
    If you have a row of interest, get a list of principal components
    that are either high or low outliers for that principal component.
    """
    result = []
    for pcNum in range(self.maxPCNum):
      l = self._getHighLowLabelingByPCN(pcNum+1)
      if l is not None:
        label = l.getLabelByRow(rowNumber)
        if label == 'high' or label == 'low':
          result.append(pcNum+1)
    return result 


  def _getHighLowLabelingByPCN(self, pcNum):
    """
    Returns the high/low row labeling for the given pcNum (1-origin), i.e.
    whatever self.dataset.getLabeling returns for the labeling's name.

      NOTE: The name depends on the Outlier Cutoff Mode chosen when the
      pcaGinzu object was created (stored in self._outlierCutoffMode) and is
      produced by _getHighLowLabelingNameByPCN, so the name format lives in
      exactly one place.

    Raises OutlierCutoffError if the stored mode is unknown.
    """
    labelingName = self._getHighLowLabelingNameByPCN(pcNum)
    return self.dataset.getLabeling(labelingName)
    
    
  def _getHighLowLabelingNameByPCN(self, pcNum):
    """
    Build the name of the high/low row labeling for principal component
    pcNum.

      The name embeds either the outlier count or the outlier cutoff,
      depending on the mode held in self._outlierCutoffMode (fixed when the
      pcaGinzu object was constructed).

    Raises OutlierCutoffError when that mode is not recognized.
    """
    mode = self._outlierCutoffMode
    if mode == CUTOFF_MODE_N_OUTLIER:
      return 'PC-%d %d High/Low' % (pcNum, self.nOutliers)
    if mode == CUTOFF_MODE_OUTLIER_CUTOFF:
      return 'PC-%d p<%f High/Low' % (pcNum, self.outlierCutoff)
    raise OutlierCutoffError('OutlierCutoffMode of "%s" unknown' % (mode))


  def _getUpFlatDownLabelingByPCN(self, pcNum):
    """
    Returns the up/flat/down column labeling for the given pcNum (1-origin),
    i.e. whatever self.dataset.getLabeling returns for the labeling's name.

      NOTE: The name depends on the Outlier Cutoff Mode chosen when the
      pcaGinzu object was created (stored in self._outlierCutoffMode) and is
      produced by _getUpFlatDownLabelingNameByPCN, so the name format lives
      in exactly one place.

    Raises OutlierCutoffError if the stored mode is unknown.
    """
    labelingName = self._getUpFlatDownLabelingNameByPCN(pcNum)
    return self.dataset.getLabeling(labelingName)

  def _getUpFlatDownLabelingNameByPCN(self, pcNum):
    """
    Build the name of the up/flat/down column labeling for principal
    component pcNum.

      The name embeds either the outlier count or the outlier cutoff,
      depending on the mode held in self._outlierCutoffMode (fixed when the
      pcaGinzu object was constructed).

    Raises OutlierCutoffError when that mode is not recognized.
    """
    mode = self._outlierCutoffMode
    if mode == CUTOFF_MODE_N_OUTLIER:
      return 'PC-%d %d-outlier Up/Flat/Down Columns' % (pcNum, self.nOutliers)
    if mode == CUTOFF_MODE_OUTLIER_CUTOFF:
      return 'PC-%d p<%f Up/Flat/Down Columns' % (pcNum, self.outlierCutoff)
    raise OutlierCutoffError('OutlierCutoffMode of "%s" unknown' % (mode))


class pcaGinzuVisualizeMatplotlib(pcaGinzu):
  """
  pcaGinzuVisualizeMatplotlib uses pcaGinzu to construct all the necessary
  labelings and then constructs non-interactive matplotlib plots showing the
  various outlier representations.  The generateResults function calls most of
  the other output plot functions herein to dump a complete set of results
  to files, and so is useful in batch mode to get all PCA results for data
  analysis, or to generate graphics for publication.

    Simplest Usage:
      p = pcaGinzu ( myDataset )
      p.generateResults ( ['ProbeId','Description'], ['Samples'] )

  """
  def plotPercentageVarianceExplained(self):
    """
    Create a new figure plotting the percentage of variance explained by
    each principal component, and return the figure handle.

    The values come from the Row PCA view's variances (which, per the
    original author's note, sum to 1) and are simply scaled by 100 here.
    The x-axis runs over component numbers 1..self.nCols.
    """
    fig = pylab.figure()
    pylab.clf()

    # name-mangled access to RowPCAView's private __variances attribute
    variancePercents = self.rowPCAView._RowPCAView__variances * 100

    upperLimit = self.nCols + 1
    pcNumbers = range(1, upperLimit)

    pylab.plot(pcNumbers, variancePercents, 'b-o')
    pylab.xlim(0, upperLimit)
    pylab.xticks(pcNumbers)
    pylab.grid('on')

    pylab.xlabel('Principal Component Number')
    pylab.ylabel('Variance (percentage)')
    pylab.title('Variance Explained by Each Principal Component')

    return fig

  def plotPCNEigenvectorInOriginalColumnOrder(self, pcNum,
                                              conditionLabels=None):
    """
    Return a handle to a new figure that shows a trajectory plot for the
    PC-<N> eigenvector.  The x-axis is ordered as the columns in the
    dataset are... original column order.

    pcNum is 1-origin; the eigenvector plotted is row pcNum-1 of
    self.rowPCAView.matrix.

    You may optionally provide conditionLabels, a list of values that
    will be converted to strings and used as the x tick labels; by
    default, the x-axis will be labeled with a simple range of integers.
    The list must be the same length as the conditions/dimensions.

    Raises ValueError if conditionLabels is given but its length differs
    from self.nCols.
    """
    # Validate before creating a figure so a bad call leaves no stray
    # figure behind; this is the same check the outlier trajectory plots do.
    if conditionLabels is not None:
      if len(conditionLabels) != self.nCols:
        msg = 'ERROR: conditionLabels list must be same size as data columns'
        raise ValueError(msg)
      conditionLabels = [str(label) for label in conditionLabels]

    f = pylab.figure()
    pylab.clf()
    eigenvector = self.rowPCAView.matrix[pcNum-1,:]

    # explicit 1..nCols x values so ticks line up with condition numbers
    cnos = range(1,self.nCols+1)
    pylab.plot(cnos,eigenvector,'b-o')

    pylab.xlim(0,self.nCols+1)

    pylab.xlabel('%d Conditions in Original Order' % (self.nCols))
    pylab.ylabel('eigenvector magnitude')

    if conditionLabels is not None:
      pylab.xticks(cnos,conditionLabels)

    return f

  def plotPCNOutlierRowsInOriginalColumnOrder(self, pcNum,
                                              conditionLabels=None):
    """
    Return a handle to a new figure that shows a trajectory plot for the
    high and low outliers.  The x-axis is ordered as the columns in the
    dataset are... original column order.

    High outlier rows are drawn in red and low outlier rows in blue; the
    rows are taken from the high/low labeling for pcNum.

    You may optionally provide conditionLabels, a list of values that
    will be converted to strings and used as the x tick labels; by
    default, the x-axis will be labeled with a simple range of integers.
    The list must be the same length as the conditions/dimensions.

    Raises ValueError if conditionLabels is given but its length differs
    from self.nCols.
    """
    if conditionLabels is not None:
      # attribute is nCols (capital C), as used everywhere else in the class
      if len(conditionLabels) != self.nCols:
        msg = 'ERROR: conditionLabels list must be same size as data columns'
        raise ValueError(msg)
      conditionLabels = [str(label) for label in conditionLabels]

    l = self._getHighLowLabelingByPCN(pcNum)

    highrows = l.getRowsByLabel('high')
    lowrows = l.getRowsByLabel('low')

    data = self.dataset.getData()
    cnos = range(1,self.nCols+1)

    # copy the outlier rows out of the dataset into dense arrays
    highdata = pylab.zeros((len(highrows),self.dataset.numCols),'d')
    for i in range(len(highrows)):
      highdata[i,:] = data[highrows[i],:]

    lowdata = pylab.zeros((len(lowrows),self.dataset.numCols),'d')
    for i in range(len(lowrows)):
      lowdata[i,:] = data[lowrows[i],:]

    f = pylab.figure()
    pylab.clf()

    # x values are always explicit (1..nCols) so tick labels line up
    for i in range(len(highrows)):
      pylab.plot(cnos, highdata[i,:],'r-',linewidth=0.3)
    for i in range(len(lowrows)):
      pylab.plot(cnos, lowdata[i,:],'b-',linewidth=0.3)

    pylab.xlim(0,self.nCols+1)

    pylab.xlabel('%d Conditions in Original Order' % (self.nCols))
    pylab.ylabel('Expression, log2(signal)')

    if conditionLabels is not None:
      pylab.xticks(cnos,conditionLabels)

    return f


  def plotPCNOutlierRowsInSigGroupOrder(self, pcNum, conditionLabels=None):
    """
    Create and return a matplotlib figure containing a plot of the
    outlier trajectories across conditions/dimensions where the
    conditions are reordered based (approximately) on significance of
    high vs. low.

    High outlier rows are drawn in red and low outlier rows in blue.
    Columns are placed in descending order of (mean of high rows - mean of
    low rows); columns with equal mean difference keep their original
    relative order.  When there are no high (or no low) rows, the
    corresponding means are taken as zero.

    You may optionally provide conditionLabels, a list of values that
    will be converted to strings and used as the x tick labels (reordered
    along with the columns); by default, the x-axis will be labeled with a
    simple range of integers.  The list must be the same length as the
    conditions/dimensions.

    Raises ValueError if conditionLabels is given but its length differs
    from self.nCols.

    Note: Presently we order conditions/dimensions by mean diff, which is
    approximately ordered by significance of difference, but we SHOULD
    make this more precisely partitioned first into Up/Flat/Down, and
    then within group ordered by mean difference.  This ordering needs
    to correspond to the order of rows output by getOutputForSigGroups.
    """
    if conditionLabels is not None:
      if len(conditionLabels) != self.nCols:
        msg = 'ERROR: conditionLabels list must be same size as data columns'
        raise ValueError(msg)
      conditionLabels = [str(label) for label in conditionLabels]

    l = self._getHighLowLabelingByPCN(pcNum)
    highrows = l.getRowsByLabel('high')
    lowrows = l.getRowsByLabel('low')

    # print high & low set sizes as an aid to interpretation
    if self.verbose:
      print("%d\t%d\t%d" % (pcNum,len(highrows),len(lowrows)))

    data = self.dataset.getData()
    cnos = range(1,self.nCols+1)

    # copy the outlier rows out of the dataset into dense arrays
    highdata = pylab.zeros((len(highrows),self.nCols),'d')
    for i in range(len(highrows)):
      highdata[i,:] = data[highrows[i],:]
    if len(highrows) > 0:
      highMeans = nx.average(highdata,0)
    else:
      highMeans = pylab.zeros((1,self.nCols),'d')[0]

    lowdata = pylab.zeros((len(lowrows),self.nCols),'d')
    for i in range(len(lowrows)):
      lowdata[i,:] = data[lowrows[i],:]
    if len(lowrows) > 0:
      lowMeans = nx.average(lowdata,0)
    else:
      lowMeans = pylab.zeros((1,self.nCols),'d')[0]

    meanDiffs = nx.subtract(highMeans,lowMeans)

    # Rank column indices by descending mean difference.  Sorting
    # (negated value, index) pairs keeps every column exactly once even when
    # several columns share the same mean difference; looking columns up by
    # value would map all tied values to the first matching column.
    ranked = [(-float(meanDiffs[j]), j) for j in range(self.nCols)]
    ranked.sort()
    columnOrder = [j for (negDiff, j) in ranked]

    lowsort  = pylab.zeros((len(lowrows), self.nCols),'d')
    highsort = pylab.zeros((len(highrows),self.nCols),'d')

    sortedLabels = []
    for i in range(self.nCols):
      index = columnOrder[i]
      if len(lowrows) > 0:
        lowsort[:,i]  = lowdata[:,index]
      if len(highrows) > 0:
        highsort[:,i] = highdata[:,index]
      if conditionLabels is not None:
        sortedLabels.append(conditionLabels[index])

    f = pylab.figure()
    pylab.clf()

    # The x axis must be explicit so that, for small sets, the x values
    # range from 1->numcols rather than 0->numcols-1; this is done whether
    # or not labels are added.
    for i in range(len(highrows)):
      pylab.plot(cnos, highsort[i,:],'r-',linewidth=0.3)
    for i in range(len(lowrows)):
      pylab.plot(cnos, lowsort[i,:],'b-',linewidth=0.3)

    pylab.xlim(0,self.nCols+1)

    pylab.xlabel('%d Conditions Ordered by Mean Difference' % (self.nCols))
    pylab.ylabel('Expression, log2(signal)')

    if len(sortedLabels)>0:
      pylab.xticks(cnos,sortedLabels)

    return f

  def plotPCvsPCWithOutliersInY(self, pcNumForXAxis, pcNumForYAxis):
    """
    Create and return a matplotlib figure containing a scatter plot of
    data points in the dataset when projected onto one principal component
    versus another.  PC numbers passed in need to be 1-origin.

    All points are drawn as small black dots; the high and low outliers
    of the Y-axis component are overdrawn as larger red and blue markers
    respectively.  Axis labels carry each component's percentage of
    variance.

    Raises OutlierCutoffError if self._outlierCutoffMode is not one of
    the known modes.
    """
    l = self._getHighLowLabelingByPCN(pcNumForYAxis)
    highrows = l.getRowsByLabel('high')
    lowrows = l.getRowsByLabel('low')

    # Work out the title (and reject an unknown mode) before creating the
    # figure, so a failure does not leave a half-built figure behind.
    if self._outlierCutoffMode == CUTOFF_MODE_N_OUTLIER:
      title = 'PC %d: %d Highest and %d Lowest Extreme Genes' % \
              (pcNumForYAxis, self.nOutliers, self.nOutliers)
    elif self._outlierCutoffMode == CUTOFF_MODE_OUTLIER_CUTOFF:
      title = 'PC %d: p<%f Highest and Lowest Extreme Genes' % \
              (pcNumForYAxis, self.outlierCutoff)
    else:
      msg = 'OutlierCutoffMode of "%s" unknown' % (self._outlierCutoffMode)
      raise OutlierCutoffError(msg)

    f = pylab.figure()
    pylab.clf()

    # columns of the PCA view data are indexed 0-origin, hence the -1
    xCol = pcNumForXAxis-1
    yCol = pcNumForYAxis-1

    pcaData = self.rowPCAView.getData()
    pylab.plot(pcaData[:,xCol],pcaData[:,yCol],'k.',markersize=3)

    for highOutlier in highrows:
      pylab.plot([pcaData[highOutlier,xCol]],
                 [pcaData[highOutlier,yCol]],'ro',markersize=5)

    for lowOutlier in lowrows:
      pylab.plot([pcaData[lowOutlier,xCol]],
                 [pcaData[lowOutlier,yCol]],'bo',markersize=5)

    pylab.title(title)

    percentages = self.rowPCAView._RowPCAView__variances * 100
    pylab.xlabel('PC %d (%4.2f%%)' %(pcNumForXAxis,percentages[xCol]))
    pylab.ylabel('PC %d (%4.2f%%)' %(pcNumForYAxis,percentages[yCol]))

    # Axis scaling is best-effort: report a failure and carry on, but never
    # swallow an interrupt or interpreter exit.
    try:
      pylab.axis('scaled')
    except (KeyboardInterrupt, SystemExit):
      raise
    except Exception:
      print('plotPCvsPCWithOutliersInY: tried, but cannot scale axes properly')

    return f


  def generateResults(self,rowLabelingNames,colLabelingNames,
                      conditionLabels=None,pcNumList=None):
    """
    For each principal component number, call the major result generating
    functions and save those results to appropriately-named files.

    rowLabelingNames is a list of row labeling names to include as columns in
    outputting high/low outlier lists, e.g. a 'ProbeID' column, a
    'Description' column, etc.

    colLabelingNames is a list of column labeling names to include as columns
    in outputting up/flat/down condition lists, e.g. a 'PatientID' column, a
    'Diagnosis' column, etc.

    You may optionally provide conditionLabels, a list of strings that
    will be used as the x tick labels in trajectory plots; by default, the
    x-axis will be labeled with a simple range of integers.  The list must be
    the same length as the conditions/dimensions.

    By default the results will be generated for all principal components,
    1,2,3,...,maxPCNum.  Users can specify an optional pcNumList to generate
    results for specific principal components: either a single number or any
    iterable of numbers (list, tuple, ...).  The principal component
    numbers are 1-origin, as users expect... 1,2,3,...,pcN.

    (Note: This function DOES generate summary plots, e.g. percentage
    variance explained plots.)
    """
    self.plotPercentageVarianceExplained()
    pylab.savefig('percentage-variance-explained')

    # Within this function pcNum is 1-origin to match how users would call
    # the various user-level functions
    if pcNumList is None:
      pcNumList = range(1,self.maxPCNum+1)
    else:
      # Accept any iterable of PC numbers; a lone number is not iterable,
      # so it is wrapped into a one-element list.
      try:
        pcNumList = list(pcNumList)
      except TypeError:
        pcNumList = [pcNumList]

    for pcNum in pcNumList:
      self.generateResultsForPCN(pcNum,rowLabelingNames,colLabelingNames,
                                 conditionLabels)


  def generateResultsForPCN(self,pcNum,rowLabelingNames,colLabelingNames,
                            conditionLabels=None):
    """
    Produce and save every per-component result for one principal
    component.  pcNum is 1-origin, as users expect... 1,2,3,...,pcN.

    Files written (NN is pcNum zero-padded to two digits):
      pcNN-outliers                              (only when pcNum > 1)
      pcNN-eigenvector
      pcNN-outlier-trajectories-order-original
      pcNN-outlier-trajectories-order-meandiff   (only if a figure is returned)
      pcNN-outliers.txt
      pcNN-condition-groups.txt                  (only if output is returned)

    rowLabelingNames is a list of row labeling names to include as columns
    in the high/low outlier list, e.g. 'ProbeID', 'Description'.

    colLabelingNames is a list of column labeling names to include as
    columns in the up/flat/down condition list, e.g. 'PatientID',
    'Diagnosis'.

    conditionLabels, if given, is passed through to the trajectory plots
    for use as x tick labels; it must be the same length as the
    conditions/dimensions.

    (Note: function does NOT generate any summary plots, e.g. percentage
    variance explained plots.)
    """
    if self.verbose:
      print('  Writing results for PC %d' % pcNum)

    # Each plot call below creates a new figure; the savefig that follows
    # presumably writes that (current) figure, so the call order matters.

    # scatter of this component against the previous one; PC 1 has none
    if pcNum > 1:
      self.plotPCvsPCWithOutliersInY(pcNum-1, pcNum)
      pylab.savefig('pc%02d-outliers' % pcNum)

    self.plotPCNEigenvectorInOriginalColumnOrder(pcNum,conditionLabels)
    pylab.savefig('pc%02d-eigenvector' % pcNum)

    self.plotPCNOutlierRowsInOriginalColumnOrder(pcNum,conditionLabels)
    pylab.savefig('pc%02d-outlier-trajectories-order-original' % pcNum)

    sigGroupFigure = self.plotPCNOutlierRowsInSigGroupOrder(pcNum,
                                                            conditionLabels)
    if sigGroupFigure is not None:
      pylab.savefig('pc%02d-outlier-trajectories-order-meandiff' % pcNum)

    outlierTable = self.getOutputForPCNOutliers(pcNum,rowLabelingNames)
    write2DStringArrayToFile(outlierTable, 'pc%02d-outliers.txt' % pcNum)

    sigGroupTable = self.getOutputForSigGroups(pcNum,colLabelingNames)
    if sigGroupTable is not None:
      write2DStringArrayToFile(sigGroupTable,
                               'pc%02d-condition-groups.txt' % pcNum)


  def plotScores(self, columnLabelingName, pcNum):
    """
    Plot, side by side, the values of a column labeling within the Up,
    Flat and Down column partitions imposed by principal component pcNum,
    and return the new figure.

    For each partition the first label of every column that has a
    non-empty label entry is plotted; Up values are drawn at x=1 in red,
    Flat at x=2 in black, and Down at x=3 in blue.  The columnLabeling
    should be a numeric labeling, but empty values are OK.

    NOTE: This is not right. need to plot labeling w/ value A (e.g. 'flat')
    versus labeling w/ value B (e.g. 'down')
    """
    # column partitions for the given principal component
    ufd = self._getUpFlatDownLabelingByPCN(pcNum)
    partitionCols = [ufd.getColsByLabel(name)
                     for name in ('up', 'flat', 'down')]

    columnLabeling = self.dataset.getLabeling(columnLabelingName)

    # per partition: first label of each column, skipping empty entries
    partitionVals = []
    for cols in partitionCols:
      entries = columnLabeling.getLabelsByCols(cols)
      partitionVals.append([entry[0] for entry in entries if entry])
    upvals, flatvals, downvals = partitionVals

    fig = pylab.figure()

    # partitions sit at x = 1, 2, 3 respectively
    xOffset = 0
    for vals, marker in zip(partitionVals, ('ro', 'ko', 'bo')):
      if len(vals) > 0:
        pylab.plot(pylab.ones(len(vals)) + xOffset, vals, marker)
      xOffset += 1

    # should construct these strings automatically given above info, scores, etc
    tickLabels = ('',
                  'Up (N=%d)' % len(upvals),
                  'Flat (N=%d)' % len(flatvals),
                  'Down (N=%d)' % len(downvals))

    pylab.title('%s Distributions for PC-%2d' %(columnLabelingName,pcNum))
    pylab.xlabel('Condition Partition')
    pylab.ylabel(columnLabelingName)
    pylab.xlim(0,4)
    pylab.xticks(pylab.arange(4),tickLabels)

    return fig


def write2DStringArrayToFile(stringArray, filename, delim='\t'):
  """
  Simple utility function to spew a 2D string array to a delimited
  text file (tab-delimited by default).

  stringArray is an iterable of rows, each an iterable of values; every
  value is formatted with '%s' and followed by delim (so each line ends
  with a trailing delimiter), and every row is terminated by a newline.
  filename is opened for writing, replacing any existing file.

  The file is closed even if formatting or writing raises.
  """
  fd = open(filename,'w')
  try:
    for row in stringArray:
      fd.write(''.join(['%s%s' % (col,delim) for col in row]))
      fd.write('\n')
  finally:
    # try/finally rather than 'with' to stay valid on old interpreters
    fd.close()
    

def secondColumnCompare(val1,val2):
  """
  Used by getOutputForPCNOutliers to sort the output by the second column
  value, in descending order.

  val1 and val2 are indexable rows; only element [1] of each is compared.
  Returns -1 if val1 should come first (larger second column), 1 if val2
  should come first, and 0 when the second columns are equal so that the
  comparison is consistent in both directions.
  """
  if val1[1] > val2[1]:
    return -1
  if val1[1] < val2[1]:
    return 1
  return 0