##################################
#                                #
# Last modified 11/02/2012       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
import math
from sets import Set

def _parse_label_fields(spec):
    """Expand a label-field specification into a sorted list of column indexes.

    spec is a comma-separated string whose items are either a single integer
    or an inclusive "start:end" range, e.g. "0,2:4" -> [0, 2, 3, 4].

    Raises ValueError if an item cannot be converted to an integer.
    """
    columns = []
    for item in spec.split(','):
        if ':' in item:
            bounds = item.split(':')
            # Both ends of the range are included.
            columns.extend(range(int(bounds[0]), int(bounds[1]) + 1))
        else:
            columns.append(int(item))
    columns.sort()
    return columns


def _accumulate_reads(list_path, label_columns, rpm_column):
    """Sum the estimated read counts per label over every listed RPM file.

    list_path names a file with one "filename <tab> total_mapped_reads" entry
    per line.  Every listed file is split on tabs; lines starting with '#'
    are treated as header lines and blank lines are skipped.  For each data
    line the RPM value in column rpm_column is converted back to a read count
    using that file's total and added to the entry keyed by the tuple of
    values found in label_columns.

    Returns (reads_by_label, total_reads, header) where header is the last
    '#' line encountered in any file, or None if there was none.
    """
    reads_by_label = {}
    total_reads_all = 0
    header = None
    with open(list_path) as list_file:
        for entry in list_file:
            if not entry.strip():
                continue
            entry_fields = entry.strip().split('\t')
            filename = entry_fields[0]
            total_reads = int(entry_fields[1])
            total_reads_all += total_reads
            # Float division: integer division would truncate the depth to
            # whole millions (and to zero for files with under 1M reads).
            millions = total_reads / 1000000.0
            with open(filename) as rpm_file:
                for line in rpm_file:
                    if line.startswith('#'):
                        header = line
                        continue
                    if not line.strip():
                        continue
                    fields = line.strip().split('\t')
                    label = tuple([fields[i] for i in label_columns])
                    reads = float(fields[rpm_column]) * millions
                    reads_by_label[label] = reads_by_label.get(label, 0) + reads
    return reads_by_label, total_reads_all, header


def run():
    """Merge several RPM files into one table of pooled RPM values.

    Command-line arguments (read from sys.argv):
      1. list_of_RPM_files: file with "filename <tab> total_mapped_reads" lines
      2. label_fields: comma-separated column indexes and/or "start:end" ranges
      3. RPM_field: index of the column holding the RPM value
      4. output filename

    Each RPM value is turned back into a read count using its file's total,
    the counts are summed per label across files, and the sums are divided
    by the combined total (in millions) to give the pooled RPM.  The output
    holds the last '#' header line seen (if any), followed by one line per
    label: the label fields and the pooled RPM, tab-separated.

    Exits with status 1 after printing usage if arguments are missing.
    Raises ZeroDivisionError if data lines exist but the combined total of
    mapped reads is zero.
    """
    # Four arguments are required; sys.argv[0] is the script name.
    if len(sys.argv) < 5:
        print('usage: python %s list_of_RPM_files label_fields RPM_field SAMstats_outfilename' % sys.argv[0])
        print('\t label fields should be comma-separated or in "start:end" format, start and end included')
        print('\tformat of list_of_RPM_files: filename <tab> total_mapped_reads')
        sys.exit(1)

    list_path = sys.argv[1]
    label_columns = _parse_label_fields(sys.argv[2])
    rpm_column = int(sys.argv[3])
    outfilename = sys.argv[4]

    reads_by_label, total_reads_all, header = _accumulate_reads(
        list_path, label_columns, rpm_column)

    # Float division for the same reason as in _accumulate_reads.
    total_millions = total_reads_all / 1000000.0

    with open(outfilename, 'w') as outfile:
        if header is not None:
            # Guard against a header that was the unterminated last line.
            outfile.write(header if header.endswith('\n') else header + '\n')
        for label in reads_by_label:
            final_rpm = reads_by_label[label] / total_millions
            outfile.write('\t'.join(list(label) + [str(final_rpm)]) + '\n')
        
# Run the merge only when this file is executed as a script, so that
# importing it does not read sys.argv or write files as a side effect.
if __name__ == '__main__':
    run()

