##################################
#                                #
# Last modified 07/11/2011       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
import math

def _sum_scores_by_repeat(lines, repeat_field, score_field):
    """Return a dict mapping each repeat ID to the sum of its scores.

    lines        -- iterable of tab-delimited text lines
    repeat_field -- zero-based index of the column holding the repeat ID
    score_field  -- zero-based index of the column holding the score

    Lines starting with '#' and blank lines are skipped.  A progress
    message is printed every 1,000,000 lines read (skipped lines are
    counted too).

    Raises IndexError if a data line has too few columns and ValueError
    if the score column cannot be parsed as a float.
    """
    totals = {}
    for line_number, line in enumerate(lines, 1):
        if line_number % 1000000 == 0:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print('%d lines processed' % line_number)
        if line.startswith('#'):
            continue
        stripped = line.strip()
        if not stripped:
            # A blank line (e.g. a trailing newline at end of file) carries
            # no data; skip it instead of failing on float('').
            continue
        fields = stripped.split('\t')
        repeat = fields[repeat_field]
        totals[repeat] = totals.get(repeat, 0) + float(fields[score_field])
    return totals


def run():
    """Sum a score column per repeat ID and write the totals, highest first.

    Command line (read from sys.argv):
        input repeatIDField scoreField outputfilename

    input           -- tab-delimited text file; per the usage text it is
                       assumed to be bedtoRPKM output for the UCSC repeat
                       masker track
    repeatIDField   -- zero-based column index of the repeat ID
    scoreField      -- zero-based column index of the score to sum
    outputfilename  -- file to write; one 'repeat<TAB>total' line per repeat,
                       sorted by descending total (ties by descending name)

    Prints the usage text and exits with status 1 if arguments are missing.
    """
    # Four required arguments plus the script name make five argv entries.
    if len(sys.argv) < 5:
        print('usage: python %s input repeatIDField scoreField outputfilename' % sys.argv[0])
        print('       assumed input file is the output of bedtoRPKM run on the repeat masker from UCSC')
        sys.exit(1)

    input_path = sys.argv[1]
    repeat_field = int(sys.argv[2])
    score_field = int(sys.argv[3])
    outfilename = sys.argv[4]

    with open(input_path) as infile:
        totals = _sum_scores_by_repeat(infile, repeat_field, score_field)

    # Sorting (score, repeat) tuples in reverse orders by score first and
    # breaks ties on the repeat name, both descending.
    ranked = sorted(((score, repeat) for repeat, score in totals.items()),
                    reverse=True)

    # The output is opened only after the input parsed successfully, so a
    # malformed input does not leave a truncated output file behind.
    with open(outfilename, 'w') as outfile:
        for score, repeat in ranked:
            outfile.write(repeat + '\t' + str(score) + '\n')
   
# Called unconditionally at module level: executing (or importing) this file
# runs the script against the current sys.argv.
run()
