"""Provide a examples that can be loaded mostly for the testing code
"""
import cPickle
import math
import os

from compClust.mlx import datasets
from compClust.mlx import labelings
from compClust.mlx import views
from compClust.config import config

def LoadCho():
  """Load the Cho cycling example dataset and attach its labelings.

  All files are read from the directory named by config.cho_data_dir.
  Four row labelings ("cho clusters", "common name", "orfs" and
  "diagem clusters") and one column labeling ("time points") are
  attached, and the "orfs" labeling is made the primary row labeling.

  Returns the datasets.Dataset built from ChoCycling.dat.
  """
  data_dir = config.cho_data_dir

  def in_data_dir(filename):
    # every example file lives directly under the configured directory
    return os.path.join(data_dir, filename)

  dataset = datasets.Dataset(in_data_dir("ChoCycling.dat"))

  # (labeling name, file holding the row labels), attached in this order
  row_label_sources = (
    ("cho clusters", "ChoClassification.rlab"),
    ("common name", "CommonNames.rlab"),
    ("orfs", "ORFs.rlab"),
    ("diagem clusters", "EM.rlab"),
  )
  row_labelings = {}
  for labeling_name, filename in row_label_sources:
    wrapper = labelings.GlobalWrapper(dataset, labeling_name)
    wrapper.labelRows(in_data_dir(filename))
    row_labelings[labeling_name] = wrapper

  # the time points are attached through GlobalLabeling, whose labelCols
  # is handed the dataset explicitly
  time_points = labelings.GlobalLabeling(dataset, "time points")
  time_points.labelCols(dataset, in_data_dir("times.clab"))

  dataset.setPrimaryRowLabeling(row_labelings["orfs"])

  return dataset

def LoadChoSource():
  """Return the Cho example wrapped in a DataSource rather than a Dataset.

  CompClustWeb needed additional code that the MLX did not, so a wrapper
  class (DataSource) was created for it.  Hopefully at some point the two
  classes will be merged to make it easier to browse MLX datasets from
  within the Web GUI.
  """
  # locate the data file
  data_dir = config.cho_data_dir
  data_path = os.path.join(data_dir, 'ChoCycling.dat')

  # build the datasource object
  from compClust.gui.DataSource import DataSource
  source = DataSource(data_path)

  # One entry per labeling to attach:
  #   (name, file name, is a row labeling, is an annotation, url)
  labeling_specs = (
    ('cho', 'ChoClassification.rlab', True, False, None),
    ('em', 'EM.rlab', True, False, None),
    ('common', 'CommonNames.rlab', True, True, None),
    ('orfs', 'ORFs.rlab', True, True, "http://somesite.com/orf=%s"),
    ('times', 'times.clab', False, True, None),
  )
  for spec in labeling_specs:
    name, filename, isrow, isannotation, url = spec
    label_path = os.path.join(data_dir, filename)
    source.add_labeling(name, label_path, isrow, isannotation, url)

  # Name the "primary" and "secondary" annotation labelings.  The primary
  # MUST BE unique and MUST BE specified for many detail plots to work,
  # because the primary name identifies which vector should be shown.
  source.primary = 'orfs'
  source.secondary = 'common'

  return source

def _build_pgc_dataset(base_url):
  """Fetch the PGC dataset from base_url and attach all of its labelings.

  Each labeling is named after its file name with the
  "compclust-labeling-" prefix and the ".clab.gz" / ".rlab.gz" suffix
  removed.  Column labelings are attached first, then row labelings.
  """
  prefix = "compclust-labeling-"
  data = datasets.Dataset(base_url+"/compclust-reannotate_select_cal.log2.dat.gz")

  column_files = ["compclust-labeling-Age.clab.gz",
                  "compclust-labeling-BMI.clab.gz",
                  "compclust-labeling-Cap_(mm2).clab.gz",
                  "compclust-labeling-Cap__Type1_(mean_n).clab.gz",
                  "compclust-labeling-Cap__Type2a_(mean_n).clab.gz",
                  "compclust-labeling-Cap__Type2b_(mean_n).clab.gz",
                  "compclust-labeling-Cap__Type2c_(mean_n).clab.gz",
                  "compclust-labeling-Cap_fiber_(mean_n).clab.gz",
                  "compclust-labeling-Centroid_(Using_34_OXPHOS-CR_Genes).clab.gz",
                  "compclust-labeling-Chol.clab.gz",
                  "compclust-labeling-Glucose_0_capillary_blood_who_85.clab.gz",
                  "compclust-labeling-Glucose_120_capillary_blood_who_85.clab.gz",
                  "compclust-labeling-Glycogen_(mmol_kg).clab.gz",
                  "compclust-labeling-Insulin_0.clab.gz",
                  "compclust-labeling-Insulin_120.clab.gz",
                  "compclust-labeling-M_value.clab.gz",
                  "compclust-labeling-Patient_No.clab.gz",
                  "compclust-labeling-Samplename_at_WICGR.clab.gz",
                  "compclust-labeling-Total_Area_(um2).clab.gz",
                  "compclust-labeling-Total_Cap_(n).clab.gz",
                  "compclust-labeling-Total_fibers_(n).clab.gz",
                  "compclust-labeling-Trigs.clab.gz",
                  "compclust-labeling-Type1_(Percent).clab.gz",
                  "compclust-labeling-Type1_(n).clab.gz",
                  "compclust-labeling-Type1_Area_(um2).clab.gz",
                  "compclust-labeling-Type1_Max__Area_(um2).clab.gz",
                  "compclust-labeling-Type1_Mean_Area_(um2).clab.gz",
                  "compclust-labeling-Type1_Min__Area_(um2).clab.gz",
                  "compclust-labeling-Type1__Area__(Percent).clab.gz",
                  "compclust-labeling-Type2a_(Percent).clab.gz",
                  "compclust-labeling-Type2a_(n).clab.gz",
                  "compclust-labeling-Type2a_Area_(Percent).clab.gz",
                  "compclust-labeling-Type2a_Area_(um2).clab.gz",
                  "compclust-labeling-Type2a_Max__Area_(um2).clab.gz",
                  "compclust-labeling-Type2a_Mean_Area_(um2).clab.gz",
                  "compclust-labeling-Type2a_Min__Area_(um2).clab.gz",
                  "compclust-labeling-Type2b_(Percent).clab.gz",
                  "compclust-labeling-Type2b_(n).clab.gz",
                  "compclust-labeling-Type2b_Area_(Percent).clab.gz",
                  "compclust-labeling-Type2b_Area_(um2).clab.gz",
                  "compclust-labeling-Type2b_Max__area_(um2).clab.gz",
                  "compclust-labeling-Type2b_Mean_Area_(um2).clab.gz",
                  "compclust-labeling-Type2b_Min__area_(um2).clab.gz",
                  "compclust-labeling-Type2c_(Percent).clab.gz",
                  "compclust-labeling-Type2c_(n).clab.gz",
                  "compclust-labeling-Type2c_Area_(Percent).clab.gz",
                  "compclust-labeling-Type2c_Area_(um2).clab.gz",
                  "compclust-labeling-Type2c_Max__area_(um2).clab.gz",
                  "compclust-labeling-Type2c_Mean_area_(um2).clab.gz",
                  "compclust-labeling-Type2c_Min__area_(um2).clab.gz",
                  "compclust-labeling-UQCRB_(209065_at).clab.gz",
                  "compclust-labeling-VO2_max_(ml_kg_min_total_body_weight).clab.gz",
                  "compclust-labeling-WHR.clab.gz",
                  "compclust-labeling-status.clab.gz",]
  for filename in column_files:
    name = filename[len(prefix):-len('.clab.gz')]
    new_labeling = labelings.GlobalWrapper(data, name)
    new_labeling.labelCols(base_url+"/"+filename)

  row_files = ["compclust-labeling-Name.rlab.gz",
               "compclust-labeling-uid.rlab.gz"]
  for filename in row_files:
    name = filename[len(prefix):-len('.rlab.gz')]
    new_labeling = labelings.GlobalWrapper(data, name)
    new_labeling.labelRows(base_url+"/"+filename)

  return data

def _has_value_above(values, threshold):
  """Return True as soon as one element of values is greater than threshold."""
  for value in values:
    if value > threshold:
      return True
  return False

def LoadPGC(pickle_name=None, rowFiltered=True, colFiltered=True):
  """
  Load a copy of the PGC dataset

  pickle_name specifies where the cached copy of the dataset lives; it
  defaults to ~/compclust-pgc.pkl.  When that file exists the dataset is
  unpickled from it, otherwise the dataset and its labelings are fetched
  from woldlab.caltech.edu and the cache file is written.  The cache
  always holds the unfiltered dataset; the filters below are applied
  after loading.

  rowFiltered, when true, keeps only the rows that have at least one
  value greater than log2(100).

  colFiltered, when true, keeps only the columns whose 'status' label is
  'NGT' or 'DM2' (all 'NGT' columns first, then the 'DM2' columns).

  Returns the dataset, or a view onto it when a filter is enabled.

  NOTE: unpickling runs whatever the cache file contains, so pickle_name
  should only ever point at a file this function wrote itself.
  """
  if pickle_name is None:
    pickle_name = os.path.expanduser("~/compclust-pgc.pkl")

  if os.path.exists(pickle_name):
    # The modes stay 'r'/'w' (not binary) so caches written by earlier
    # versions of this function remain readable.
    cache_file = open(pickle_name,'r')
    try:
      data = cPickle.load(cache_file)
    finally:
      # close explicitly rather than leaving it to garbage collection
      cache_file.close()
  else:
    base_url = "http://woldlab.caltech.edu/compclust/examples/pgc_diabetes"
    data = _build_pgc_dataset(base_url)
    cache_file = open(pickle_name,'w')
    try:
      cPickle.dump(data, cache_file)
    finally:
      # closing also flushes the pickle to disk before we carry on
      cache_file.close()

  if rowFiltered:
    log2_100 = math.log(100,2)
    row_indices = [ row_index for row_index in xrange(data.getNumRows())
                              if _has_value_above(data.getRowData(row_index),
                                                  log2_100) ]
    data = views.RowSubsetView(data, row_indices)

  if colFiltered:
    status = data.getLabeling('status')
    # presumably 'NGT' marks the healthy and 'DM2' the diabetic samples
    # (the original local was named diabetic_vs_healthy_cols) -- confirm
    diabetic_vs_healthy_cols = status.getColsByLabel('NGT') + status.getColsByLabel('DM2')
    data = views.ColumnSubsetView(data, diabetic_vs_healthy_cols)

  return data

def LoadGNF(pickle_name=None):
  """
  Load GNF dataset

  pickle_name specifies where you want the cached copy of the dataset to
  live; it defaults to ~/compclust-gnf.pkl.  When that file exists the
  dataset is unpickled from it and returned as is.  Otherwise the
  dataset is fetched from woldlab.caltech.edu, labeled, written to the
  cache file and returned.

  Labelings attached on a fresh load: 'Tissue' on the columns, 'ProbeId'
  on the rows (also set as the primary row labeling), plus one row
  labeling per annotation file, named after the part of the file name
  between "row_labeling-" and ".tab.gz".

  NOTE: unpickling runs whatever the cache file contains, so pickle_name
  should only ever point at a file this function wrote itself.
  """
  if pickle_name is None:
    pickle_name = os.path.expanduser("~/compclust-gnf.pkl")

  if os.path.exists(pickle_name):
    # The modes stay 'r'/'w' (not binary) so caches written by earlier
    # versions of this function remain readable.
    cache_file = open(pickle_name,'r')
    try:
      return cPickle.load(cache_file)
    finally:
      # close explicitly rather than leaving it to garbage collection
      cache_file.close()

  base_url = "http://woldlab.caltech.edu/compclust/examples/gnf_human"
  data = datasets.Dataset(base_url + "/U133A+GNF1B_101402.AD.filtered.log2.data.tab.gz")

  tissue = labelings.GlobalWrapper(data, 'Tissue')
  tissue.labelCols(base_url + '/U133A+GNF1B_101402.AD.column_labeling.tab.gz')

  probe_id = labelings.GlobalWrapper(data, 'ProbeId')
  probe_id.labelRows(base_url + '/U133A+GNF1B_101402.AD.row_labeling.tab.gz')
  data.setPrimaryRowLabeling(probe_id)

  # additional row annotations, one labeling per file
  prefix = "U133A+GNF1B_101402.AD.row_labeling-"
  labeling_files = ["U133A+GNF1B_101402.AD.row_labeling-Aliases.tab.gz",
                    "U133A+GNF1B_101402.AD.row_labeling-Description.tab.gz",
                    "U133A+GNF1B_101402.AD.row_labeling-Ensembl.tab.gz",
                    "U133A+GNF1B_101402.AD.row_labeling-Function.tab.gz",
                    "U133A+GNF1B_101402.AD.row_labeling-Genome_Location.tab.gz",
                    "U133A+GNF1B_101402.AD.row_labeling-LocusLink.tab.gz",
                    "U133A+GNF1B_101402.AD.row_labeling-Name.tab.gz",
                    "U133A+GNF1B_101402.AD.row_labeling-Protein_Families.tab.gz",
                    "U133A+GNF1B_101402.AD.row_labeling-RefSeq.tab.gz",
                    "U133A+GNF1B_101402.AD.row_labeling-Reporters.tab.gz",
                    "U133A+GNF1B_101402.AD.row_labeling-Taxon.tab.gz",
                    "U133A+GNF1B_101402.AD.row_labeling-UniGene.tab.gz",
                    "U133A+GNF1B_101402.AD.row_labeling-UniProt.tab.gz",]

  for filename in labeling_files:
    name = filename[len(prefix):-len('.tab.gz')]
    new_labeling = labelings.GlobalWrapper(data,name)
    new_labeling.labelRows(base_url+"/"+filename)

  cache_file = open(pickle_name,'w')
  try:
    cPickle.dump(data, cache_file)
  finally:
    # closing also flushes the pickle to disk before the dataset is returned
    cache_file.close()
  return data
