##################################
#                                #
# Last modified 2019/01/05       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
import gzip
import math
from sets import Set

def _read_config(config):
    """Parse the config file into a {label: [column indices]} dictionary.

    Each non-blank line has the form 'fieldID1,fieldID2,... <tab> label'.
    Blank lines are skipped.  Prints a message and exits with status 1 if
    the same label occurs more than once.
    """
    columns = {}
    with open(config) as handle:
        for line in handle:
            stripped = line.strip()
            if not stripped:
                # tolerate empty lines (e.g. a trailing newline at end of file)
                continue
            fields = stripped.split('\t')
            label = fields[1]
            if label in columns:
                print('duplicate new labels encoutnered, exiting')
                sys.exit(1)
            columns[label] = [int(ID) for ID in fields[0].split(',')]
    return columns


def _combine_columns(fields, IDs, doAverage, doCufflinksStatus):
    """Return the output cell (as a string) for one new column of one row.

    fields            -- the tab-split values of the current data row
    IDs               -- indices of the input columns merged into this cell
    doAverage         -- divide the sum by len(IDs) instead of reporting it
    doCufflinksStatus -- cells look like 'value,STATUS'; values whose STATUS
                         is 'FAIL' are left out of the sum, and the cell
                         becomes 'FAIL' when every input column is FAIL
    """
    if doCufflinksStatus:
        total = 0
        notFAIL = 0
        for ID in IDs:
            parts = fields[ID].split(',')
            # the value is parsed even for FAIL entries, so a malformed
            # number is reported regardless of its status tag
            value = float(parts[0])
            status = parts[1]
            if status != 'FAIL':
                notFAIL += 1
                total += value
        if notFAIL == 0:
            return 'FAIL'
    else:
        total = sum(float(fields[ID]) for ID in IDs)

    if doAverage:
        # NOTE(review): in Cufflinks-status mode the divisor is the number of
        # configured columns, not the number of non-FAIL columns, so FAIL
        # entries effectively count as 0.  Kept as is; confirm it is intended.
        return str(total / len(IDs))
    return str(total)


def run():
    """Command-line entry point: merge groups of columns of a tab-delimited file.

    Arguments are read from sys.argv:
        1  datafilename  -- input table; opened with gzip if it ends in '.gz'
        2  fields to keep -- comma-separated column indices copied unchanged
        3  config        -- file mapping groups of column indices to a label
        4  outfilename   -- output table
    Optional flags anywhere on the command line: -average, -cufflinksStatus.

    The output starts with a '#' header line holding the sorted new labels,
    followed by one line per data row.  Input lines starting with '#' or
    'tracking_id' are skipped.  Prints usage and exits with status 1 when
    fewer than the four required arguments are given.
    """
    # outfilename is sys.argv[4], so four arguments after the script name
    # are required
    if len(sys.argv) < 5:
        print('usage: python %s  datafilename <fields to keep (0,1,2,...)> config outfilename [-average] [-cufflinksStatus]' % sys.argv[0])
        print('       config file format: fieldID1,fieldID2,(fieldID3....) <tab> label')
        sys.exit(1)

    doAverage = '-average' in sys.argv
    if doAverage:
        print('will average values across columns')

    doCufflinksStatus = '-cufflinksStatus' in sys.argv
    if doCufflinksStatus:
        print('will treat values as having a Cufflinks FAIL/OK/LOWDATA tag')

    datafilename = sys.argv[1]
    fieldsToKeep = sorted(int(ID) for ID in sys.argv[2].split(','))
    config = sys.argv[3]
    outfilename = sys.argv[4]

    NewColumnsDict = _read_config(config)
    keys = sorted(NewColumnsDict)

    if datafilename.endswith('.gz'):
        opener = gzip.open
    else:
        opener = open

    with open(outfilename, 'w') as outfile:
        # header: '#', one empty cell per kept field, then the new labels
        header = '#' + '\t' * len(fieldsToKeep) + '\t'.join(keys)
        outfile.write(header.strip() + '\n')

        with opener(datafilename) as lineslist:
            for t, line in enumerate(lineslist, 1):
                if t % 1000000 == 0:
                    print('%d lines processed' % t)
                if not isinstance(line, str):
                    # gzip.open yields bytes on Python 3; normalise so the
                    # parsing below is the same for both kinds of input
                    line = line.decode()
                if line[0] == '#' or line.startswith('tracking_id'):
                    continue
                fields = line.strip().split('\t')
                cells = [fields[ID] for ID in fieldsToKeep]
                for label in keys:
                    cells.append(_combine_columns(fields,
                                                  NewColumnsDict[label],
                                                  doAverage,
                                                  doCufflinksStatus))
                outfile.write('\t'.join(cells).strip() + '\n')
        
# Run only when executed as a script, so that importing this module does not
# parse sys.argv or call sys.exit as a side effect.
if __name__ == '__main__':
    run()

