##################################
#                                #
# Last modified 2019/10/21       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
from sets import Set

def run():

    if len(sys.argv) < 2:
        print 'usage: python %s gtf outfilename [-FPKM] [-TPM]' % sys.argv[0]
        print '       Note: if the gtf files is the output of Cufflinks, use the -FPKM option and the expression levels of transcripts covering that bp will be summed' 
        print '       otherwise, the number of transcripts covering that bp will be outputed' 
        sys.exit(1)

    gtf = sys.argv[1]
    outfile = open(sys.argv[2], 'w')

    doFPKM=False
    if '-FPKM' in sys.argv:
        doFPKM=True
        print 'will treat the gtf file as cufflinks output and give FPKM scores'

    doTPM = False
    if '-TPM' in sys.argv:
        doTPM=True
        print 'will treat the gtf file as cufflinks output and give TPM scores'

    TPMDict = {}

    CoverageDict={}
    lineslist = open(gtf)
    i=0
    for line in lineslist:
        if line.startswith('#'):
            continue
        i+=1
        if i % 100000 == 0:
            print i, 'lines processed'
        fields = line.strip().split('\t')
        chr=fields[0]
        if fields[2] == 'transcript':
            tID = fields[8].split('transcript_id "')[1].split('";')[0]
            if doFPKM:
                FPKM = float(fields[8].split('FPKM "')[1].split('";')[0])
                TPMDict[tID] = FPKM
            if doTPM:
                TPM = float(fields[8].split('TPM "')[1].split('";')[0])
                TPMDict[tID] = TPM
        if fields[2]!='exon':
            continue
        if CoverageDict.has_key(chr):
            pass
        else:
            CoverageDict[chr]={}
        start=int(fields[3])
        stop=int(fields[4])
        if doFPKM or doTPM:
            tID = fields[8].split('transcript_id "')[1].split('";')[0]
            FPKM = TPMDict[tID]
#            FPKM = float(fields[8].split('FPKM "')[1].split('";')[0])
            if FPKM == 0:
                continue
            for j in range(start,stop):
                if CoverageDict[chr].has_key(j):
                    CoverageDict[chr][j]+=FPKM
                else:
                    CoverageDict[chr][j]=FPKM
        else:
            for j in range(start,stop):
                if CoverageDict[chr].has_key(j):
                    CoverageDict[chr][j]+=1
                else:
                    CoverageDict[chr][j]=1

    chrlist=CoverageDict.keys()
    chrlist.sort()
    for chr in chrlist:
        posList=CoverageDict[chr].keys()
        posList.sort()
        if len(posList) == 0:
            continue
        sameScorePos=posList[0]
        lastPos=posList[0]
        for pos in posList[1:-1]:
            if CoverageDict[chr][pos]==CoverageDict[chr][sameScorePos] and pos==lastPos+1:
                pass
            else:
                outline=chr+'\t'+str(sameScorePos)+'\t'+str(lastPos+1)+'\t'+str(CoverageDict[chr][sameScorePos])
                outfile.write(outline+'\n')
                sameScorePos=pos
            lastPos=pos
        outline=chr+'\t'+str(sameScorePos)+'\t'+str(lastPos+1)+'\t'+str(CoverageDict[chr][sameScorePos])
        outfile.write(outline+'\n')

    outfile.close()

run()

