##################################
#                                #
# Last modified 12/29/2012       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string

def run():

    if len(sys.argv) < 4:
        print 'usage: python %s GTF chrom.sizes TSS_window_size outfilename' % sys.argv[0]
        sys.exit(1)

    GTF = sys.argv[1]
    chrom_sizes = sys.argv[2]
    window=int(sys.argv[3])
    outfilename = sys.argv[4]

    outfile = open(outfilename, 'w')

    TranscriptDict = {}

    listoflines = open(GTF)
    for line in listoflines:
        if line.startswith('#'):
            continue
        fields = line.strip().split('\t')
        if fields[2] != 'exon':
            continue
        chr = fields[0]
        left = int(fields[3])
        right = int(fields[4])
        strand = fields[6]
        transcriptID=fields[8].split('transcript_id "')[1].split('"')[0]
        if TranscriptDict.has_key(chr):
            pass
        else:
            TranscriptDict[chr]={}
        if TranscriptDict[chr].has_key(transcriptID):
            pass
        else:
            TranscriptDict[chr][transcriptID]=[]
        TranscriptDict[chr][transcriptID].append((chr,left,right,strand))

    print 'finished inputting annotation'

    listoflines = open(chrom_sizes)
    for line in listoflines:
        if line.startswith('#'):
            continue
        fields = line.strip().split('\t')
        chr = fields[0]    
        size = int(fields[1])
        coverageDict = {}
        print chr
#       1 -> TSS
#       2 -> exon
#       3 -> intron
#       4 -> intergenic
        if TranscriptDict.has_key(chr):
            pass
        else:
            outline = chr + '\t0\t' + str(size) + '\tintergenic'
            outfile.write(outline + '\n')
            continue
        for transcriptID in TranscriptDict[chr].keys():
            (chr,left,right,strand) = TranscriptDict[chr][transcriptID][0]
            if strand == '-':
                TSS = TranscriptDict[chr][transcriptID][-1][2]
            if strand == '+':
                TSS = TranscriptDict[chr][transcriptID][0][1]-1
            for i in range(TSS-window, TSS + window):
                coverageDict[i] = 1
        for transcriptID in TranscriptDict[chr].keys():
            for (chr,left,right,strand) in TranscriptDict[chr][transcriptID]:
                for i in range(left-1,right):
                    if coverageDict.has_key(i):
                        continue
                    else:
                        coverageDict[i] = 2
        for transcriptID in TranscriptDict[chr].keys():
            TranscriptDict[chr][transcriptID].sort()
            if len(TranscriptDict[chr][transcriptID]) == 1:
                continue
            for e in range(len(TranscriptDict[chr][transcriptID])-1):
                for i in range(TranscriptDict[chr][transcriptID][e][2],TranscriptDict[chr][transcriptID][e+1][1]):
                    if coverageDict.has_key(i):
                        continue
                    else:
                        coverageDict[i] = 3
        if coverageDict.has_key(0):
            current = coverageDict[0]
        else:
            current = 4
        initial = 0
        for i in range(size):
            if current == 4:
                if coverageDict.has_key(i):
                    outline = chr + '\t' + str(max(0,initial)) + '\t' + str(i) + '\tintergenic'
                    if max(0,initial) == 0 and i == 0:
                        pass
                    else:
                        outfile.write(outline + '\n')
                    current = coverageDict[i]
                    initial = i
                else:
                    continue
            else:
                if coverageDict.has_key(i):
                    if coverageDict[i] == current:
                        continue
                    else:
                        if current == 1:
                            type = 'TSS'
                            outline = chr + '\t' + str(max(0,initial)) + '\t' + str(i) + '\t' + type
                        if current == 2:
                            type = 'exonic'
                            outline = chr + '\t' + str(max(0,initial)) + '\t' + str(i) + '\t' + type
                        if current == 3:
                            type = 'intronic'
                            outline = chr + '\t' + str(max(0,initial)) + '\t' + str(i) + '\t' + type
                        outfile.write(outline + '\n')
                        current = coverageDict[i]
                        initial = i
                else:
                    if current == 1:
                        type = 'TSS'
                        outline = chr + '\t' + str(max(0,initial)) + '\t' + str(i) + '\t' + type
                    if current == 2:
                        type = 'exonic'
                        outline = chr + '\t' + str(max(0,initial)) + '\t' + str(i) + '\t' + type
                    if current == 3:
                        type = 'intronic'
                        outline = chr + '\t' + str(max(0,initial)) + '\t' + str(i) + '\t' + type
                    outfile.write(outline + '\n')
                    current = 4
                    initial = i
        outline = chr + '\t' + str(max(0,initial)) + '\t' + str(i+1) + '\t' + 'intergenic'
        outfile.write(outline + '\n')
                
    outfile.close()

run()
