##################################
#                                #
# Last modified 08/22/2013       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import math
import random
import string

def run():

    if len(sys.argv) < 5:
        print 'usage: python %s gtf RepeatMasker chrFieldID elementIDs outfilename' % sys.argv[0]
        sys.exit(1)

    gtf = sys.argv[1]
    RepeatMasker = sys.argv[2]
    chrFieldID = int(sys.argv[3])
    ElementFieldIDs = []
    fields = sys.argv[4].split(',')
    for ID in fields:
        ElementFieldIDs.append(int(ID))
    ElementFieldIDs.sort()
    outputfilename = sys.argv[5]

    j=0
    lineslist = open(gtf)
    TranscriptDict = {}
    for line in lineslist:
        j+=1
        if j % 100000 == 0:
            print j, 'lines processed'
        if line.startswith('#'):
            continue
        fields=line.strip().split('\t')
        if fields[2]!='exon':
            continue
        chr=fields[0]
        if TranscriptDict.has_key(chr):
            pass
        else:
            TranscriptDict[chr] = {}
        if 'gene_name "' in fields[8]:
            geneName=fields[8].split('gene_name "')[1].split('";')[0]
        else:
            geneName=fields[8].split('gene_id "')[1].split('";')[0]
        geneID=fields[8].split('gene_id "')[1].split('";')[0]
        if 'transcript_name "' in fields[8]:
            transcriptName=fields[8].split('transcript_name "')[1].split('";')[0]
        else:
            transcriptName=fields[8].split('transcript_id "')[1].split('";')[0]
        transcriptID=fields[8].split('transcript_id "')[1].split('";')[0]
        transcript = (geneID, geneName, transcriptName, transcriptID)
        if TranscriptDict[chr].has_key(transcript):
            pass
        else:
            TranscriptDict[chr][transcript]=[]
        left=int(fields[3])
        right=int(fields[4])
        orientation=fields[6]
        TranscriptDict[chr][transcript].append((chr,left,right,orientation))

    print 'finished parsing annotation'

    j=0
    lineslist = open(RepeatMasker)
    RepeatDict = {}
    for line in lineslist:
        j+=1
        if j % 1000000 == 0:
            print j, 'lines processed'
        if line.startswith('#'):
            continue
        fields=line.strip().split('\t')
        chr = fields[chrFieldID]
        if RepeatDict.has_key(chr):
            pass
        else:
            RepeatDict[chr] = {}
        Repeat = []
        for ID in ElementFieldIDs:
            Repeat.append(fields[ID])
        Repeat = tuple(Repeat)
        if RepeatDict[chr].has_key(Repeat):
            pass
        else:
            RepeatDict[chr][Repeat] = []
        RepeatDict[chr][Repeat].append(fields)

    print 'finished parsing repeats'

    outfile = open(outputfilename, 'w')

    outline = '#GeneID\tGeneName\tTranscriptID\tTranscriptName\tOverlap\tRepeatMasker_fields'
    outfile.write(outline + '\n')

    for chr in TranscriptDict.keys():
        print chr
        CoverageDictExonic = {}
        CoverageDictIntronic = {}
        TranscriptToFullAnnotationDict = {}
        t = 0
        for transcript in TranscriptDict[chr].keys():
            t+=1
            TranscriptToFullAnnotationDict[t]=transcript
            coordinates = []
            for (chr,left,right,orientation) in TranscriptDict[chr][transcript]:
                coordinates.append(left)
                coordinates.append(right)
                for i in range(left,right):
                    if CoverageDictExonic.has_key(i):
                        CoverageDictExonic[i].append(t)
                    else:
                        CoverageDictExonic[i] = []
                        CoverageDictExonic[i].append(t)
            for i in range(min(coordinates),max(coordinates)):
                if CoverageDictIntronic.has_key(i):
                    CoverageDictIntronic[i].append(t)
                else:
                    CoverageDictIntronic[i] = []
                    CoverageDictIntronic[i].append(t)
        if RepeatDict.has_key(chr):
            pass
        else:
            continue
        RepeatToTranscriptDictExonic = {}
        RepeatToTranscriptDictIntronic = {}
        for repeat in RepeatDict[chr].keys():
            for fields in RepeatDict[chr][repeat]:
                left = int(fields[6])
                right = int(fields[7])
                for i in range(left,right):
                    if CoverageDictExonic.has_key(i):
                        for t in CoverageDictExonic[i]:
                            transcript = TranscriptToFullAnnotationDict[t]
                            RepeatToTranscriptDictExonic[(transcript,tuple(fields))]=0
                    if CoverageDictIntronic.has_key(i):
                        for t in CoverageDictIntronic[i]:
                            transcript = TranscriptToFullAnnotationDict[t]
                            RepeatToTranscriptDictIntronic[(transcript,tuple(fields))]=0
        for (transcript,fields) in RepeatToTranscriptDictIntronic.keys():
            if RepeatToTranscriptDictExonic.has_key((transcript,fields)):
                overlap = 'exonic'
            else:
                overlap = 'intronic-only'
            (geneID, geneName, transcriptName, transcriptID) = transcript
            outline = geneID + '\t' + geneName + '\t' + transcriptName + '\t' + transcriptID
            outline = outline + '\t' + overlap
            for f in list(fields):
                outline = outline + '\t' + f
            outfile.write(outline + '\n')

    outfile.close()

run()

