########################################
# The contents of this file are subject to the MLX PUBLIC LICENSE version
# 1.0 (the "License"); you may not use this file except in
# compliance with the License.
# 
# Software distributed under the License is distributed on an "AS IS"
# basis, WITHOUT WARRANTY OF ANY KIND, either express or implied.  See
# the License for the specific language governing rights and limitations
# under the License.
# 
# The Original Source Code is "compClust", released 2003 September 03.
# 
# The Original Source Code was developed by the California Institute of
# Technology (Caltech).  Portions created by Caltech are Copyright (C)
# 2002-2003 California Institute of Technology. All Rights Reserved.
########################################
#
#       Authors: Benjamin J. Bornstein, Diane Trout, Lucas Scharenbroich
# Last Modified: 30-Nov-2001, 15:30
#

"""
Usage: DiagEM.py parameter_filename input_filename output_filename

 Wrapper for diagonal em algorithm

 Depends on the following environment variables:
   DIAGEM_COMMAND   (e.g., /proj/cluster_gazing2/bin/diagem)

 Brief Algorithm Description:

            Performs EM segmentation of an array of feature 
            vectors.  The algorithm is from Bishop's "Neural
            Networks for Pattern Recognition", page 65.  This 
            particular EM algorithm fits Gaussians to the data.
            Each element of the feature vector is assumed to
            be independent (i.e. independent channels). 

 Required Parameters:  (note: the list enclosed in the brackets shows the
                              possible values each parameter can take )

         k               = <x>

                           x is the number of clusters to find

         num_iterations  = <x>

                           Where x is the number of iterations to perform over
                           the data set

         distance_metric = [correlation, correlation_centered, euclidean]

                           The correlation metric is actually Euclidean
                           distance on the data set mapped to the surface
                           of a hypersphere.  This approximates the
                           correlation metric.

         init_method     = [church_means, random_means, random_point,
                            random_range, random_sample, file]
         

 Optional / Dependent Parameters:

         k_strict        = ['true', 'false']

                           Turns on/off k strict behavior, which means that
                           if the exact number of k clusters is not found,
                           i.e. there are collapsed clusters, then do not
                           return _any_ results.  Collapsed clusters tend to happen
                           more often with the euclidean metric than the
                           correlation metric which can return singleton
                           clusters

         seed            = <x> (optional)

                           Where x is the number used to seed the random
                           number generator.  This parameter allows runs
                           of the algorithm to be deterministic.  If the
                           parameter is omitted, it will be initialized
                           to 42

         samples         = <x> (depends on init_method)

                           If the random_sample initialization method is
                           chosen, then this parameter defines how many points
                           to sample for each mean.  It must be >0 and <rows.

         means_file      = "file name" (depends on init_method)

                           If the file initialization method is chosen, this
                           parameters specifies the file to load the means
                           from.

         annealing       = ['on', 'off']

                           Turns on annealing.  If not specified, assumed to
                           be 'off'

         initial_temp    = <x>

                           Starting temperature to run the annealer at.

         schedule        = <x>

                           Temperature schedule.  The initial_temp is
                           multiplied by this number every step.  Needs to be
                           in the range (0.0, 1.0), but should be in the
                           high 0.90s.
                                    
         em_type         = ['scalar', 'diagonal']

                           Restricts the freedom of the covariance matrix
                           calculations.  Assumed to be 'diagonal'.

 Deprecated Parameters:

         Not needed (set to constant values)

            test_fraction
            train_fraction

         Superseded by the parameter 'k'

            min_clusters   
            max_clusters

         Only applicable to mccv run which are now handled by the MCCV.py
         wrapper

            stepsize
            seed
            crossvalidation_runs
            crossvalidation_samples
"""

import os
import re
import sys
import string
import Numeric
import tempfile
import types

from compClust.config import config
from compClust.util   import Verify
from compClust.util   import Usage
from compClust.util   import WrapperUtil
from compClust.util.TimeStampedPrintStream import TimeStampedPrintStream

from compClust.mlx.datasets import Dataset
from compClust.mlx.labelings import Labeling, ClusteredLabeling
from compClust.mlx.models import MixtureOfDiagonalGaussians
import compClust.mlx.ML_Algorithm as ML_Algorithm

from compClust.util.WrapperParameterDescription import WrapperParameterDescription as WPD

import compClust.mlx.wrapper  

MESSAGE_STREAM = TimeStampedPrintStream("%Y-%b-%d %H:%m: DiagEM: ")


#
# DiagEM Parameter Definitions
#

# Descriptions - Parameters
# Long-form help text for the property declarations in Parameters below.
# Each string is passed as the ``doc`` argument of a property definition.
# NOTE(review): the first two mention "kmeans"/"num_restarts" and appear to
# have been copied from the KMeans wrapper -- confirm the wording is still
# accurate for DiagEM.
k_strict_desc = """If \"true\", kmeans will treat k as a strict parameter.  That is,
 if k clusters could not be found, (after an optional
 num_restarts, in the case of randomly initialized means) no
 result will be reported."""
# NOTE(review): max_starts_desc and num_mean_samples_desc are defined here
# but not referenced by any property in this file.
max_starts_desc = 'The maximum number of restarts in the case of collapsed clusters (valid only for randomly initialized means).'
num_mean_samples_desc = """If init_means = \"random_sample\", this parameter indicates the
 number of datapoints to sample (without replacement) when
 estimating initial means.
"""
# Help for the 'seed' property.
seed_desc = 'The seed to use for the pseudo-random number generator (valid only for randomly initialized means).'
# Help for the 'samples' property (used when init_method = random_sample).
samples_desc = """If the random_sample initialization method is
chosen, then this parameter defines how many points
to sample for each mean.  It must be >0 and <rows.
"""
# Help for the experimental annealing 'schedule' property.
schedule_desc="""Temperature schedule.  The initial_temp is
multiplied by this number every step.  Needs to be
in the range (0.0, 1.0), but should be in the
high 0.90s.
"""
# Help for the 'em_type' property.
em_type_desc="""Restricts the freedom of the covariance matrix
calculations.  Assumed to be 'diagonal'.
"""

import compClust.util.WrapperParameters as wp

class Parameters(wp.WrapperParameters):
  """Parameter schema for the DiagEM wrapper.

  Declares the required, optional, experimental and internal parameters
  accepted by the DiagEM algorithm (see the module docstring for the
  meaning of each).  The internal parameters are temporary filenames
  filled in by the wrapper itself while it runs.
  """

  _params = [
    # required parameters
    wp.IntProperty('k', 2, min=1,
                   doc='The number of clusters, k, to find.',
                   priority=wp.Priority.REQUIRED),
    wp.ComboProperty('distance_metric', 'euclidean',
                     ['euclidean', 'correlation', 'correlation_centered'],
                     doc='Distance metric of "correlation" or "euclidean"',
                     priority=wp.Priority.REQUIRED),
    wp.ComboProperty('init_method', 'church_means',
                     ['church_means', 'random_means', 'random_point',
                      'random_sample', 'random_range'],
                     doc='Initialization method of DiagEM',
                     priority=wp.Priority.REQUIRED),
    wp.IntProperty('num_iterations', 100, min=0,
                   doc='The number of iterations.',
                   priority=wp.Priority.REQUIRED),
    # optional parameters
    # FIXME: should this be a boolean property?
    wp.ComboProperty('k_strict', 'false',
                     ['true', 'false'],
                     doc=k_strict_desc,
                     priority=wp.Priority.OPTIONAL),
    wp.IntProperty('seed', 42,
                   doc=seed_desc,
                   priority=wp.Priority.OPTIONAL,),
    wp.IntProperty('samples', 0, min=0,
                   doc=samples_desc,
                   priority=wp.Priority.OPTIONAL),
    wp.ComboProperty('em_type', 'diagonal',
                     ['scalar', 'diagonal'],
                     doc=em_type_desc,
                     priority=wp.Priority.OPTIONAL),
    # experimental parameters
    wp.ComboProperty('annealing', 'off', ['on', 'off'],
                     doc='Turns on/off simulated annealing',
                     priority=wp.Priority.EXPERIMENTAL),
    wp.FloatProperty('initial_temp', 10.0,
                     doc='Starting temperature to run the annealer at.',
                     priority=wp.Priority.EXPERIMENTAL),
    wp.FloatProperty('schedule', 0.9,
                     doc=schedule_desc,
                     priority=wp.Priority.EXPERIMENTAL),
    # internal parameters -- temporary filenames managed by the wrapper.
    # BUGFIX: removed a duplicate 'clusteringInputFilename' entry that
    # appeared twice in this list.
    wp.StrProperty('clusteringInputFilename', priority=wp.Priority.INTERNAL),
    wp.StrProperty('clusteringOutputFilename', priority=wp.Priority.INTERNAL),
    wp.StrProperty('clusteringMeansFilename', priority=wp.Priority.INTERNAL),
    wp.StrProperty('clusteringVarianceFilename', priority=wp.Priority.INTERNAL),
    wp.StrProperty('clusteringWeightsFilename', priority=wp.Priority.INTERNAL),
    wp.StrProperty('clusteringProbsFilename', priority=wp.Priority.INTERNAL),
    wp.StrProperty('clusteringInternalFilename', priority=wp.Priority.INTERNAL),
    ]
                   

#
# Wrapper class for the DiagEM (diagonal expectation minimization) algorithm
#

class DiagEM(ML_Algorithm.ML_Algorithm):
  """Wrapper around the external DiagEM (diagonal EM) clustering binary.

  The wrapper writes the dataset plus a parameter header to a temporary
  input file, invokes the external program named by config.diagem_command,
  and reads the resulting labeling and model files back into compClust
  objects (a ClusteredLabeling and a MixtureOfDiagonalGaussians).
  """

  def __init__(self, dataset = None, parameters = None):
    """Create a DiagEM wrapper for the given dataset and parameter set."""
    self.setMessageStream( MESSAGE_STREAM )
    self.dataset    = dataset
    self.parameters = Parameters(parameters)
    self.model      = None
    self.labeling   = None

    # Remember the process-wide default temporary directory so run() can
    # restore it after redirecting tempfile.tempdir to a scratch directory.
    self.default_tempdir = tempfile.gettempdir()

  def copy(self):
    """Return a shallow copy sharing the dataset, labeling and model."""
    newObj = DiagEM(self.dataset, self.parameters)
    newObj.labeling = self.labeling
    newObj.model = self.model
    return newObj

  def getLabeling(self):
    """Return the ClusteredLabeling from the last run (None before run)."""
    return self.labeling

  def getModel(self):
    """Return the mixture model from the last run (None before run)."""
    return self.model

  def getTransformedDataset(self, dataset):
    """Return the dataset as transformed internally by diagem, or None.

    For the non-euclidean metrics diagem maps the data onto the surface
    of a hypersphere; this method runs diagem for zero iterations with
    k = 1 purely to obtain that internal (transformed) representation.
    The wrapper's state is saved and restored around the throw-away run.
    Returns None when the metric is plain "euclidean".
    """

    xform = None

    if self.parameters[ "distance_metric" ] != "euclidean":

      #
      # Save the old state
      #
      old_dataset  = self.getDataset()
      old_labeling = self.getLabeling()
      old_model    = self.getModel()
      old_param    = self.parameters.copy()

      self.setDataset(dataset)

      # One cluster and zero iterations makes the run a no-op apart from
      # writing out the internal (transformed) data file.
      self.parameters.k = 1
      self.parameters.num_iterations = 0

      self.__execute()

      file     = self.parameters["clusteringInternalFilename"]
      internal = self.readDiagemOutput(file)
      xform    = Dataset(internal)

      # Clean up the scratch directory and any labeling the run attached.
      self.__removeDir( WrapperUtil.tempfile.tempdir )
      if self.labeling is not None:
        dataset.removeLabeling(self.labeling)

      #
      # Restore
      #
      self.setDataset(old_dataset)
      self.labeling = old_labeling
      self.model    = old_model
      self.parameters = old_param

    return xform

  def __execute(self):
    """
    __execute(self)

    Builds the diagem input file, derives all of the output filenames,
    records them in self.parameters and executes the external binary.
    """

    #
    # Create a temporary working directory for diagem's input and output
    # files
    #

    WrapperUtil.tempfile.tempdir = WrapperUtil.create_temporary_directory("DiagEM_")

    #
    # Prep the data file and store it in a temporary location, also construct
    # all the base+extension filenames we'll need
    #

    inputFilename    = WrapperUtil.tempfile.mktemp(".tmp")
    outputFilename   = WrapperUtil.tempfile.mktemp(".out")

    base = os.path.splitext(outputFilename)[0]

    internalFilename = base + ".internal"
    meansFilename    = base + ".means"
    varianceFilename = base + ".variances"
    weightsFilename  = base + ".weights"
    probsFilename    = base + ".probs"

    self.parameters[ "clusteringInputFilename"   ] = inputFilename
    self.parameters[ "clusteringOutputFilename"  ] = outputFilename
    self.parameters[ "clusteringMeansFilename"   ] = meansFilename
    self.parameters[ "clusteringVarianceFilename"] = varianceFilename
    self.parameters[ "clusteringWeightsFilename" ] = weightsFilename
    self.parameters[ "clusteringProbsFilename"   ] = probsFilename
    self.parameters[ "clusteringInternalFilename"] = internalFilename

    #
    # Construct the data file for DiagEM and store it in a temporary
    # location
    #

    self.createClusteringInputFile()

    #
    # Create the diagem command-line and launch it.
    # NOTE(review): os.system() with unquoted paths -- assumes
    # config.diagem_command and the temp paths contain no shell
    # metacharacters.
    #

    commandLine = self.createClusteringCommandLine()
    os.system(commandLine)

  def __removeDir(self, dir):
    """
    __removeDir(dir)

    Removes all the files in a directory and the directory itself.
    Assumes the directory contains only files (no subdirectories), which
    holds for the scratch directories this wrapper creates.
    """

    files = os.listdir( dir )
    for file in files:
      os.remove(os.path.join(dir, file ))
    os.rmdir ( dir )

  def run(self):
    """run()

    Prepares the inputs to the clustering algorithm (DiagEM), runs it,
    and unpacks the results into self.model and self.labeling.  Returns
    compClust.mlx.wrapper.WRAPPER_STATUS_DONE.
    """

    #
    # run the algorithm
    #

    self.__execute()

    #
    # unpack clustering output files into a Model
    #

    self.createModel()

    #
    # Load in the full cluster-membership probabilities and attach them
    # to the dataset as a row labeling of tuples.
    #

    file  = self.parameters[ "clusteringProbsFilename" ]
    probs = self.readDiagemOutput(file)
    probLabeling = Labeling(self.dataset, "Probabilities")
    probLabeling.labelRows(map(tuple, probs))

    #
    # Load the clustering results and produce a labeling, also check that the
    # k-strict condition was not violated.  If it was, this is an invalid run
    #

    outputFilename = self.parameters[ "clusteringOutputFilename" ]

    if os.access( outputFilename, os.F_OK):
      stream = open(outputFilename, "r")
      text   = map(string.strip, stream.readlines())
      stream.close()

      self.labeling = ClusteredLabeling(self.dataset, self.__class__, self.parameters)
      self.labeling.labelRows(text)

      #
      # If k_strict is requested and fewer than k clusters came back
      # (collapsed clusters), discard the entire result.
      #

      if self.parameters.has_key( "k_strict" ) and \
         self.parameters[ "k_strict" ] == "true":

        if not self.checkKStrict(self.labeling):
          self.model    = None
          self.labeling = None

          MESSAGE_STREAM.write("k_strict failure!\n")

    #
    # Cleanup temporary files and directory.
    #

    self.__removeDir( WrapperUtil.tempfile.tempdir )

    # return tempfile.tempdir to its default tempdir
    tempfile.tempdir = self.default_tempdir

    return compClust.mlx.wrapper.WRAPPER_STATUS_DONE

  def checkKStrict(self, labeling):
    """checkKStrict(labeling)

    Ensure that the number of clusters returned by the algorithm actually
    matches the number we asked for.  Returns true if the desired k is
    the returned k.
    """

    #
    # Since the wrappers only apply a single set of labels, we can do it
    # like this.
    #

    return len(labeling.getLabels()) == self.parameters["k"]

  def createClusteringInputFile(self):
    """createClusteringInputFile()

    Write the diagem input file: a "key value" header built from the
    current parameters, a "begin data" marker, then the dataset itself.
    """

    params      = self.parameters
    destFile    = open(params[ "clusteringInputFilename" ],'w')

    #
    # The dataset is in memory, so use that to create the proper file on
    # disk
    #

    numRows    = self.dataset.getNumRows()
    numColumns = self.dataset.getNumCols()

    #
    # create the DiagEM header and write that to the file first
    #

    header      = []

    # Number of rows and columns in the dataset
    header.append( "rows %d" % numRows    )
    header.append( "cols %d" % numColumns )

    #
    # set the algorithm type; annealing requires a starting temperature
    # and a cooling schedule
    #

    if params.has_key("annealing") and params[ "annealing" ] == "on":
      header.append( "algorithm em_annealing" )
      header.append( "init_temp %s"          % params[ "initial_temp" ]    )
      header.append( "schedule_scalar %s"    % params[ "schedule" ]        )
    else:
      header.append( "algorithm em" )

    #
    # write out all of the parameters
    #

    header.append( "num_clusters %s"         % params[ "k" ]               )
    header.append( "max_em_iterations %s"    % params[ "num_iterations" ]  )
    header.append( "random_seed %s"          % params[ "seed" ]            )
    header.append( "distance_metric %s"      % params[ "distance_metric" ] )
    header.append( "init_method %s"          % params[ "init_method"]      )
    header.append( "output_internal 1"                                     )
    header.append( "output_model 1"                                        )

    # Optional / dependent parameters are emitted only when present.
    if params.has_key( "samples" ):
      header.append( "num_samples %s"        % params[ "samples"]          )

    if params.has_key( "means_file" ):
      header.append( "means_file %s"         % params[ "means_file"]       )

    if params.has_key( "em_type" ):
      header.append( "em_type %s"            % params[ "em_type"]          )

    if params.has_key( "fast" ):
      header.append( "fast %s"               % params[ "fast" ]            )

    #
    # write out the indicator for the start of data and a trailing newline
    # to separate 'begin data' from the actual data
    #

    header.append( "begin data" )
    header.append( "" )

    destFile.write( string.join(header, '\n') )

    #
    # Append the dataset itself to the header...
    #

    self.dataset.writeDataset( destFile )

    #
    # ...and done
    #

    destFile.close()

  def createClusteringCommandLine(self):
    """commandLine = createClusteringCommandLine()

    Construct the command line needed to run the DIAGEM command on the
    current dataset: "<diagem_command> <input file> <output file>".
    """

    command = []

    command.append(config.diagem_command)
    command.append(self.parameters[ 'clusteringInputFilename'  ])
    command.append(self.parameters[ 'clusteringOutputFilename' ])

    commandLine = string.join(command, " ")

    return commandLine

  def readDiagemOutput(self, file):
    """
    Numeric.array = readDiagemOutput(file)

    Reads in an output file produced by DiagEM and returns the numeric
    values in a Numeric array.  Patterned after the __castDataset() method
    in mlx.Dataset, but special cased.  Returns an empty array (and logs a
    message) when the file is not readable.
    """

    data = []

    if os.access(file, os.R_OK) == 1:

      stream = open(file, "r")

      #
      # Each line is a tab-separated row of floats.
      #

      for line in stream.readlines():
        data.append( map( float, string.split( line, "\t" )))

      # BUGFIX: the close() used to sit inside the loop body, so an empty
      # file leaked the handle and non-empty files re-closed it per line.
      # Close exactly once, after the loop.
      stream.close()

    else:

      MESSAGE_STREAM.write("Output file " + str(file) + " does not exist\n")

    #
    # Construct the Numeric array from the list of lists, data.
    #

    return Numeric.array(data)

  def createModel(self):
    """createModel()

    Read the model parameter files into a set of Numeric arrays and create
    a Mixture of Diagonal Gaussians model from them.  Also read in the
    internal data used by the algorithm so fitness comparisons are valid.
    Sets self.model to None when no clusters were produced.
    """

    #
    # load in the means found
    #

    file  = self.parameters["clusteringMeansFilename"]
    means = self.readDiagemOutput(file)

    #
    # load in the weights
    #

    file    = self.parameters["clusteringWeightsFilename"]
    weights = Numeric.ravel(self.readDiagemOutput(file))

    #
    # Determine the k found (one weight per cluster)
    #

    k = weights.shape[0]

    if k > 0:

      #
      # load in the variances
      #
      # Since these are simply the diagonals of full covariance matrices,
      # they need to be expanded to full (diagonal) matrices
      #

      file    = self.parameters["clusteringVarianceFilename"]
      v       = self.readDiagemOutput(file)
      fullvar = Numeric.zeros((k, v.shape[1], v.shape[1]), Numeric.Float)

      for i in range(k):
        for j in range(v.shape[1]):
          fullvar[i,j,j] = v[i,j]

      #
      # Build the model (Mixture of Diagonal Gaussians)
      #

      self.model = MixtureOfDiagonalGaussians(k, means, fullvar, weights)

    else:

      self.model = None

  def validate(self):
    """validate()

    Ensures that all parameters and environment variables necessary
    to run the clustering algorithm (DiagEM) are defined.  Returns a
    truthy value when validation passes.
    """

    parameterNames =   [ "k",
                         "num_iterations",
                         "distance_metric",
                         "init_method"
                        ]

    fail = 0

    # NOTE(review): Verify.parameters_exist() appears to return a truthy
    # value (e.g. the missing names) when parameters are absent -- confirm
    # in compClust.util.Verify.
    if Verify.parameters_exist( parameterNames, self.parameters ):
      fail = 1

    #
    # Check dependencies: random_sample requires a valid 'samples' count,
    # file initialization requires a 'means_file'
    #

    if self.parameters["init_method"] == "random_sample":
      if Verify.parameters_exist([ "samples" ], self.parameters):
        fail = 1
      else:
        if self.parameters[ "samples" ] < 1 or \
           self.parameters[ "samples" ] > self.dataset.getNumRows():
          MESSAGE_STREAM.write("samples is out of range\n")
          fail = 1

    if self.parameters["init_method"] == "file":
      if Verify.parameters_exist([ "means_file" ], self.parameters):
        fail = 1

    #
    # Explicitly check for a seed, if one does not exist provide a default
    # value
    #

    if not self.parameters.has_key( "seed" ):
      self.parameters[ "seed" ] = 42

    #
    # See if annealing is turned on, if it is, then a temp and schedule
    # must be provided
    #

    if self.parameters.setdefault("annealing", "off") == "on":
      if Verify.parameters_exist( ["initial_temp", "schedule"],
                                  self.parameters):
        fail = 1

    #
    # Fail if the command cannot be executed
    #

    if Verify.fs_objects_have_permissions( config.diagem_command ,
                                           os.X_OK ) == 0:
      MESSAGE_STREAM.write("%s is not executable." % (config.diagem_command))
      fail = 1

    return not fail

if (__name__ == "__main__"):
  from compClust.mlx.wrapper import Launcher

  # Command-line entry point.  Launcher parses
  # "DiagEM.py parameter_filename input_filename output_filename"
  # (see the module docstring) and drives the DiagEM instance.
  Launcher.main(sys.argv, DiagEM())
