##################################
#                                #
# Last modified 2021/08/14       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string

def run():
    """Sum a value column per label and report the largest value's share.

    Command-line arguments (read from ``sys.argv``):
        1. input          -- path of a tab-delimited text file
        2. labelfields    -- comma-separated, zero-based column indices that
                             together form the grouping label
        3. valuefield     -- zero-based index of the numeric column
        4. outputfilename -- path of the tab-delimited file to write

    Every input line that starts with '#' or 'tracking_id' is treated as a
    header: its label columns are copied to the output followed by the
    column names 'total' and 'fraction'.  All other lines are grouped by
    the tuple of their label columns.  For each group one line is written
    containing the label columns, the sum of the value column, and the
    largest single value divided by that sum ('nan' when the sum is 0).

    Prints the usage message and exits with status 1 when fewer than four
    arguments are given.
    """
    # Four positional arguments are needed, so argv must hold five entries
    # (sys.argv[4] is read below).
    if len(sys.argv) < 5:
        print('usage: python %s input labelfields valuefield outputfilename' % sys.argv[0])
        sys.exit(1)

    input_path = sys.argv[1]
    outfilename = sys.argv[4]

    # Parse the numeric arguments before touching any file so that a bad
    # argument does not leave a truncated output file behind.
    # Label columns are always emitted in ascending index order.
    labelFields = sorted(int(f) for f in sys.argv[2].split(','))
    print(labelFields)

    VFID = int(sys.argv[3])
    print(VFID)

    # label tuple -> [running sum, running maximum]; only these two
    # aggregates are needed, so individual values are not retained.
    stats_by_label = {}

    LC = 0

    with open(input_path) as linelist, open(outfilename, 'w') as outfile:
        for line in linelist:
            LC += 1
            if LC % 1000000 == 0:
                print('%d M lines processed' % (LC // 1000000))
            # NUL bytes are stripped before splitting into columns.
            fields = line.replace('\x00', '').strip().split('\t')
            if line.startswith('#') or line.startswith('tracking_id'):
                header = [fields[ID] for ID in labelFields] + ['total', 'fraction']
                outfile.write('\t'.join(header) + '\n')
                continue
            label = tuple(fields[ID] for ID in labelFields)
            value = float(fields[VFID])
            stats = stats_by_label.setdefault(label, [0.0, None])
            stats[0] += value
            if stats[1] is None or value > stats[1]:
                stats[1] = value

        for label in stats_by_label:
            total, largest = stats_by_label[label]
            if total == 0:
                # Avoid dividing by zero; the fraction is undefined.
                fraction = 'nan'
            else:
                fraction = str(largest / total)
            outfile.write('\t'.join(list(label) + [str(total), fraction]) + '\n')
   
# Only execute when run as a script, so that importing this module does not
# parse sys.argv or exit the importing process.
if __name__ == '__main__':
    run()
