import os
import math
import re
import string

from compClust.gui.DataSource import DataSource
from compClust.mlx import views
from compClust.mlx import labelings
from compClust.gui.LabelingSource import LabelingSource

def multirun_kenji(ds):
  """Cluster ds.dataset with DiagEM over ten fixed seeds and attach the result.

  Runs a MultiRun DiagEM clustering (k=5) and registers the resulting
  labeling on the DataSource under the name 'multirun-5'.
  """
  from compClust.mlx import wrapper
  seeds = [2881, 2325, 1744, 912, 502, 2235, 491, 1268, 2000, 2892]
  params = {'distance_metric': 'correlation_centered',
            'init_method': 'church_means',
            'k': 5,
            'multiRun_parameter_name': 'seed',
            'multiRun_parameter_values': seeds,
            'num_iterations': 75,
            'seed': 42}

  job = wrapper.MultiRun(ds.dataset, params, wrapper.DiagEM())
  job.validate()
  job.run()

  name = 'multirun-5'
  result = job.getLabeling()
  result.setName(name)
  # expose the clustering through the DataSource's labeling registry
  ds._labeling_sources[name] = LabelingSource(name, isrow=True,
                                              isannotation=False)
  
def load_4366_filtered_conditions():
  """Load kenji dataset and some default labelings"""
  base_dir = os.path.expanduser('~/proj/kenji_neuralcrest/joe')

  ds = DataSource(os.path.join(base_dir,
                               'kenji-averaged-4366-genes-14-cond.txt'))
  # work with log2-transformed intensities
  ds.dataset = views.FunctionView(ds.dataset, lambda x: math.log(x, 2))

  # (name, filename, isrow, isannotation, url) for each default labeling
  labelings = (
    ('BAWnum', 'kenji-averaged-4366-genes-14-cond-RL-BAWnum.txt', True),
    ('Description', 'kenji-averaged-4366-genes-14-cond-RL-SeqDescription.txt',
     True),
    ('conditions', 'kenji-averaged-4366-genes-14-cond-CL-condition.txt',
     False))

  for name, filename, isrow in labelings:
    ds.add_labeling(name, os.path.join(base_dir, filename), isrow, True, None)

  ds.primary = 'BAWnum'
  ds.secondary = 'Description'
  return ds


def load_annotated_tab_file(filename, name, column=None, row=None):
  """Load a tab delimited text file with annotations embedded in it

  Filename is the tab delimited text file to load
  name is the name to assign to the created DataSource

  column is the name for the column annotation, or none if there are none.
  (it's hard to know how to meaningfully specify multiple column annotations in
  this file format)
  
  Row is a dictionaries of the form "annoation_name": int
  The integer represents which column contains the label for annotation_name's row
  labeling.

  The column count is just directly used by python so it's zero based. 

  A file of the format

  Accession_id     Description   1hr       2hr      3hr     4hr
  A0001001_01      ORF23114      .23       .421     1.23    3.71
  A0002314_02      YFG_001       .271      9.131    3.132   2.145

  would be loaded by
  load_annotated_tab_file("file", "time", column="Time",
                           row={'Accesion_id': 0, 'Description': 1 })
  """
  data_stream = open(filename, 'r')
  tab_re = re.compile("\t")

  # build map and storage for row annotations
  row_annotations = {}
  row_annotation_map = {}
  if row is not None:
    for annotation_name, column_id in row.items():
      row_annotations[annotation_name] = []
      row_annotation_map[column_id] = row_annotations[annotation_name]

  # if we have a column annotation suck it in too
  if column is not None:
    column_annotation = []
    file_col_header = tab_re.split(data_stream.readline())
    for column_index in range(len(file_col_header)):
      if column_index not in row_annotation_map.keys():
        column_annotation.append(file_col_header[column_index])

  # accumulators
  data = []
  count = 0
  # process rows out of file
  for file_row in data_stream.xreadlines():
    file_row = tab_re.split(string.strip(file_row))
    # process columns in the row
    data_row = []
    for element_index in xrange(len(file_row)):
      if row_annotation_map.has_key(element_index):
        row_annotation_map[element_index].append(file_row[element_index])
      else:
        datum = float(file_row[element_index])
        if datum == 1:
          datum = 1.000000001
        data_row.append(datum)
    data.append(data_row)
    # progress bar?
    count += 1
    if (count % 1000) == 0:
      print '\b.',
      count = 0
  ds = DataSource(data)
  # log 2 transform the data
  ds.dataset = views.FunctionView(ds.dataset, lambda x: math.log(x, 2))
  ds.name = name
  # add row annotations
  for annotation_name, labeling_data in row_annotations.items():
    ds.add_labeling(annotation_name, labeling_data, isrow=True, isannotation=True)

  # add column annotation
  if column is not None:
    ds.add_labeling(column, column_annotation, isrow=False, isannotation=True)

  return ds
  
def load_kenji_ratio_unfiltered_8799_by_10_unfiltered_scaled():
  """Load unfiltered full dataset with all the labelings in one text file
  """
  path = os.path.expanduser("~/proj/kenji_neuralcrest/Kenji_complete_ratios_8799_X_10_unfiltered_scaled.txt")
  # file column index for each embedded row annotation
  annotation_columns = {'BAW index': 0,
                        'Sequence ID': 1,
                        'Sequence Description': 2}
  ds = load_annotated_tab_file(path, 'kenji 8799 unfiltered',
                               column='conditions', row=annotation_columns)
  ds.primary = 'BAW index'
  ds.secondary = 'Sequence Description'
  return ds

def load_kenji_ratio_invarient_3993_by_10():
  """Load unfiltered full dataset with all the labelings in one text file
  """
  path = os.path.expanduser("~/proj/kenji_neuralcrest/Kenji_ratios_3993_X_10_invariants_out.txt")
  # file column index for each embedded row annotation
  annotation_columns = {'BAW index': 0,
                        'Sequence ID': 1,
                        'Sequence Description': 2}
  ds = load_annotated_tab_file(path, "kenji 3993 invariant",
                               column='conditions', row=annotation_columns)
  ds.primary = 'BAW index'
  ds.secondary = 'Sequence Description'
  return ds

def load_kenji_affy_4278_x_10():
  """Load the revised 4278 x 10 affy scanner dataset."""
  path = os.path.expanduser("~/proj/kenji_neuralcrest/Kenji_revised_affy_4278_X_10.txt")
  # file column index for each embedded row annotation
  annotation_columns = {'Probe Set ID': 0,
                        'Target Description': 1,
                        'Gene Title': 2}
  ds = load_annotated_tab_file(path, "kenji affy 4278 x 10",
                               column='conditions', row=annotation_columns)
  ds.primary = 'Probe Set ID'
  ds.secondary = 'Target Description'
  return ds

def load_kenji_affy_2042_x_10():
  """Load the filtered 2042 x 10 affy scanner dataset."""
  path = os.path.expanduser("~/proj/kenji_neuralcrest/Kenji_affy_2042_X_10.txt")
  # file column index for each embedded row annotation
  annotation_columns = {'Probe Set ID': 0,
                        'Target Description': 1,
                        'Gene Title': 2}
  ds = load_annotated_tab_file(path, "kenji filtered affy 2042 x 10",
                               column='conditions', row=annotation_columns)
  ds.primary = 'Probe Set ID'
  ds.secondary = 'Target Description'
  return ds

def load_kenji_2042_X_10_ratios_over_FcIg():
  """Load the 2042 x 10 dataset of ratios over the FcIg control."""
  path = os.path.expanduser("~/proj/kenji_neuralcrest/2042_X_10_ratios_over_FcIg.txt")
  # file column index for each embedded row annotation
  annotation_columns = {'Probe Set ID': 0,
                        'Target Description': 1,
                        'Gene Title': 2}
  ds = load_annotated_tab_file(path, "2042 x 10 FcIg",
                               column='conditions', row=annotation_columns)
  ds.primary = 'Probe Set ID'
  ds.secondary = 'Target Description'
  return ds

def load_kenji_2042_X_10_ratios_over_Mock():
  """Load the 2042 x 10 dataset of ratios over the Mock control."""
  path = os.path.expanduser("~/proj/kenji_neuralcrest/2042_X_10_ratios_over_Mock.txt")
  # file column index for each embedded row annotation
  annotation_columns = {'Probe Set ID': 0,
                        'Target Description': 1,
                        'Gene Title': 2}
  ds = load_annotated_tab_file(path, "2042 x 10 Mock",
                               column='conditions', row=annotation_columns)
  ds.primary = 'Probe Set ID'
  ds.secondary = 'Target Description'
  return ds

def load_kenji(ginsu=False, cluster=False):
  """Load default example data sets

  ginsu -- when true, also run the PCA ginsu visualization on each dataset
  cluster -- when true, also run MultiRun clusterings with k=10 and k=20

  Returns the list of loaded DataSources.
  """
  # NOTE: the original imported pcaGinsu here but never used it; the ginsu
  # path does its own import inside ginsu_kenji().
  kenji_sets = []

  ## these two datasets were processed through resolver
  #kenji_sets.append(load_kenji_ratio_unfiltered_8799_by_10_unfiltered_scaled())
  #kenji_sets.append(load_kenji_ratio_invarient_3993_by_10())

  # this dataset is straight from the affy scanner
  kenji_sets.append(load_kenji_affy_4278_x_10())
  kenji_sets.append(load_kenji_affy_2042_x_10())

  # these datasets are the results of comparing data and the controls
  kenji_sets.append(load_kenji_2042_X_10_ratios_over_Mock())
  kenji_sets.append(load_kenji_2042_X_10_ratios_over_FcIg())

  if ginsu:
    ginsu_kenji(kenji_sets)

  if cluster:
    # 'dataset' instead of 'set' to avoid shadowing the builtin
    for dataset in kenji_sets:
      multirun_clustering(dataset, [10])
      multirun_clustering(dataset, [20])

  return kenji_sets


def ginsu_kenji(kenji_sets):
  """Run the matplotlib PCA ginsu visualization over each dataset in kenji_sets."""
  from compClust.mlx.pcaGinsu import pcaGinsuVisualizeMatplotlib as PCAGinsu
  # 'dataset' instead of 'set' to avoid shadowing the builtin
  for dataset in kenji_sets:
    PCAGinsu(dataset.dataset)

def multirun_clustering(datasource, k_list):
  from compClust.mlx.wrapper import DiagEM, MultiRun, KMeans
  import copy
  diagem_param={'distance_metric':'correlation',
             'init_method':'random_point',
             'init_means': 'church',
             'k':20,
             'k_strict':'false',}
  
  multirun_param={'parameter_name': 'k',
                  'parameter_values':k_list,
                  'num_trials':75,
                  'seed':42}
  name = "MultiRun_cor mul=%s mul=5 mul=k" % (str(k_list))
  clustering = MultiRun(datasource.dataset, multirun_param,DiagEM(diagem_param))
  clustering.validate()
  clustering.run()
  clustering_labeling = clustering.getLabeling()
  clustering_labeling.setName(name)
  source = LabelingSource(name, isrow=True, isannotation=False)
  datasource._labeling_sources[name] = source
  print "done with ", datasource.name, name
  return clustering_labeling

# Default output labelings we care about for the kenji dataset
# (a tuple of row-annotation names used when exporting results).
output_labeling_names = 'Probe Set ID', 'Gene Title', 'Target Description'

if __name__ == "__main__":
  # Bug fix: multirun_clustering requires a k_list argument; the original
  # one-argument call raised TypeError. Use k=10 to match load_kenji's default.
  multirun_clustering(load_kenji_2042_X_10_ratios_over_Mock(), [10])
