##################################
#                                #
# Last modified 2019/07/09       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import math
from sets import Set
import gzip

def run():
    """Sum the base pairs covered by each repeat and report genome fractions.

    All inputs are taken from ``sys.argv``:

        repeatMasker    tab-delimited repeat annotation; opened with gzip
                        when the name ends in ``.gz`` or ``.bgz``
        leftID          0-based column index of the left coordinate
        rightID         0-based column index of the right coordinate
        repeatFieldID   0-based column index of the repeat name
        chromInfo       tab-delimited file whose second column holds a
                        sequence length
        outputfilename  path of the report to write

    Lines starting with ``#`` and blank lines are skipped in both input
    files.  The summed genome length is printed to stdout.  The report has
    a header line followed by one ``repeat<TAB>total_bp<TAB>fraction`` line
    per distinct repeat name, where ``total_bp`` is the sum of
    ``right - left`` over that repeat's records.

    Exits with status 1 after printing a usage message when fewer than six
    arguments are supplied.  Raises ``ZeroDivisionError`` if repeats are
    found but the chromInfo lengths sum to zero.
    """
    # Six user arguments are required, so argv (script name included) must
    # hold at least seven entries; sys.argv[6] is read below.
    if len(sys.argv) < 7:
        print('usage: python %s repeatMasker leftID rightID repeatFieldID chromInfo outputfilename' % sys.argv[0])
        sys.exit(1)

    RM = sys.argv[1]
    leftFieldID = int(sys.argv[2])
    rightFieldID = int(sys.argv[3])
    repeatFieldID = int(sys.argv[4])
    chromInfo = sys.argv[5]
    outfilename = sys.argv[6]

    # Kept as a float so the per-repeat ratio below is a true division.
    GenomeLength = 0.0

    with open(chromInfo) as chrom_handle:
        for line in chrom_handle:
            if line.startswith('#') or not line.strip():
                continue
            fields = line.strip().split('\t')
            GenomeLength += int(fields[1])

    print(GenomeLength)

    RepeatDict = {}

    # NOTE(review): gzip.open() without a mode yields byte strings on
    # Python 3; the text handling below presumes Python 2 str lines for
    # compressed input -- confirm before running under Python 3.
    if RM.endswith(('.gz', '.bgz')):
        repeat_handle = gzip.open(RM)
    else:
        repeat_handle = open(RM)
    with repeat_handle:
        for line in repeat_handle:
            if line.startswith('#') or not line.strip():
                continue
            fields = line.strip().split('\t')
            left = int(fields[leftFieldID])
            right = int(fields[rightFieldID])
            repeat = fields[repeatFieldID]
            RepeatDict[repeat] = RepeatDict.get(repeat, 0) + (right - left)

    with open(outfilename, 'w') as outfile:
        outfile.write('#repeat\ttotal_bp\tfraction_of_genome\n')
        for repeat in RepeatDict:
            total_bp = RepeatDict[repeat]
            outline = repeat + '\t' + str(total_bp) + '\t' + str(total_bp / GenomeLength)
            outfile.write(outline + '\n')
   
# Script entry point: run() is invoked unconditionally at module level, so it
# also executes (and reads sys.argv) if this file is imported.
run()