##################################
#                                #
# Last modified 03/23/2013       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
from sets import Set


def _parse_file_list(list_path):
    """Read the list-of-files table and return a {label: spec} dict.

    Every non-blank line holds four tab-separated fields: the label, the data
    file path, three comma-separated column indexes (chromosome, start, stop)
    and the column index of the value.  Each spec is a dict with the keys
    'file', 'labelFields' and 'ValueField'.

    Exits with status 1 when a line has too few fields or does not list
    exactly three chromosome/start/stop columns.
    """
    specs = {}
    with open(list_path) as handle:
        for line in handle:
            fields = line.strip().split('\t')
            if fields == ['']:
                # Tolerate blank lines (e.g. a trailing empty line).
                continue
            if len(fields) < 4:
                print('4 tab-separated fields expected in the list of files, '
                      '%s given, exiting' % len(fields))
                sys.exit(1)
            label_fields = [int(field_id) for field_id in fields[2].split(',')]
            if len(label_fields) != 3:
                print('3 label fields expected %s given, exiting' % len(label_fields))
                sys.exit(1)
            specs[fields[0]] = {
                'file': fields[1],
                'labelFields': label_fields,
                'ValueField': int(fields[3]),
            }
    return specs


def _iter_regions(path, label_fields, value_field, max_value):
    """Yield (region ID, value) for every data line of the file at path.

    Lines starting with '#' and blank lines are skipped.  The region ID has
    the form chromosome:start-stop and is built from the three columns in
    label_fields.  When max_value is not None the value is capped at it.
    The line count is printed every million lines as a progress indicator.

    Exits with status 1 when a line lacks one of the requested columns or
    its value column is not a number.
    """
    chrom_col, start_col, stop_col = label_fields
    with open(path) as handle:
        for line_number, line in enumerate(handle, 1):
            if line_number % 1000000 == 0:
                print(line_number)
            if line.startswith('#'):
                continue
            fields = line.strip().split('\t')
            if fields == ['']:
                continue
            try:
                value = float(fields[value_field])
                region_id = '%s:%s-%s' % (
                    fields[chrom_col], fields[start_col], fields[stop_col])
            except (ValueError, IndexError):
                print('exiting %s %s' % (value_field, fields))
                sys.exit(1)
            if max_value is not None:
                value = min(max_value, value)
            yield region_id, value


def _format_rescaled(value, lowest, span):
    """Return value rescaled into [0, 1] as text truncated to 3 decimals.

    Results below 0.001 are reported as '0.001'.  A zero span (every value in
    the file is identical) is treated as a rescaled value of 0 instead of
    dividing by zero.
    """
    scaled = (value - lowest) / span if span else 0.0
    if scaled < 0.001:
        return '0.001'
    # Truncate (do not round) to three decimals, working on the str() form.
    whole, _, decimals = str(scaled).partition('.')
    return whole + '.' + decimals[:3]


def _collect_rows(specs, labels, do_rescale, max_value):
    """Build {region ID: [region ID, cell, cell, ...]} over all labels.

    Each data file is read twice: the first pass registers the region IDs and
    finds the value range, the second pass appends one cell per data line.
    """
    rows = {}
    for label in labels:
        spec = specs[label]
        path = spec['file']
        value_field = spec['ValueField']
        label_fields = spec['labelFields']
        print('inputting data %s %s' % (label, path))
        print('%s %s %s %s' % (label, path, value_field, label_fields))

        # Infinite sentinels so that any real value replaces them, including
        # values below -1.
        lowest = float('inf')
        highest = float('-inf')
        for region_id, value in _iter_regions(path, label_fields, value_field, max_value):
            rows.setdefault(region_id, [region_id])
            if value < lowest:
                lowest = value
            if value > highest:
                highest = value

        span = highest - lowest
        if do_rescale:
            print('%s %s %s %s' % (span, highest, lowest, max_value))

        print('rescaling data')
        for region_id, value in _iter_regions(path, label_fields, value_field, max_value):
            if do_rescale:
                cell = _format_rescaled(value, lowest, span)
            else:
                # Without -rescale the (possibly truncated) value is written
                # as-is so that the table still carries the data.
                cell = str(value)
            rows[region_id].append(cell)
    return rows


def run():
    """Merge per-region values from several files into one tab-separated table.

    Command line: <list of input files> <output file> [-rescale]
    [-truncate maxValue].

    The output starts with a '#ID' header naming the labels in sorted order,
    followed by one line per region ID (chromosome:start-stop), sorted by ID.
    With -truncate, values are capped at maxValue.  With -rescale, the values
    of each file are mapped onto [0, 1] using that file's own minimum and
    maximum, truncated to three decimals and floored at 0.001.

    NOTE: a cell is appended once per data line, so a region that is missing
    from one of the files, or repeated within a file, yields a row whose
    columns do not line up with the header.
    """
    # Both the list of files and the output name are mandatory.
    if len(sys.argv) < 3:
        print('usage: python %s <list of input files> outputfilename [-rescale] [-truncate maxValue]' % sys.argv[0])
        print('\tformat of list of files file: label <tab> filename <tab> <chromosome,start,stop fields> <value field>')
        print('\tthree fields are expected for chromosome,start,stop, they will be used to build unique IDs of the chromosome:start-stop form')
        sys.exit(1)

    list_path = sys.argv[1]
    out_path = sys.argv[2]

    # max_value stays None when no truncation was requested.
    max_value = None
    if '-truncate' in sys.argv:
        try:
            max_value = float(sys.argv[sys.argv.index('-truncate') + 1])
        except (IndexError, ValueError):
            print('-truncate requires a numeric maxValue, exiting')
            sys.exit(1)

    do_rescale = '-rescale' in sys.argv

    specs = _parse_file_list(list_path)
    labels = sorted(specs)

    # The output is opened before the data files are read so that an
    # unwritable destination is reported before the expensive work starts.
    with open(out_path, 'w') as outfile:
        outfile.write('\t'.join(['#ID'] + labels).strip() + '\n')
        rows = _collect_rows(specs, labels, do_rescale, max_value)
        for region_id in sorted(rows):
            outfile.write('\t'.join(rows[region_id]) + '\n')
            
# Script entry point.  NOTE: this call is unconditional, so it executes
# whenever the module is loaded, including when it is imported.
run()
