##################################
#                                #
# Last modified 10/09/2012       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
from sets import Set

def run():

    if len(sys.argv) < 2:
        print 'usage: python %s gtf outputfilename [-novel annotation_gtf] ' % sys.argv[0]
        sys.exit(1)
    
    gtf = sys.argv[1]
    outfilename = sys.argv[2]

    doNovelOnly = False
    if '-novel' in sys.argv:
        doNovelOnly=True
        KnownTranscriptDict={}
        KnownGeneDict={}
        linelist  = open(sys.argv[sys.argv.index('-novel')+1])
        for line in linelist:
            if line.startswith('#'):
                continue
            fields=line.split('\n')[0].split('\t')
            transcriptID=fields[8].split('transcript_id "')[1].split('"')[0]
            KnownTranscriptDict[transcriptID]=0
            geneID=fields[8].split('gene_id "')[1].split('"')[0]
            KnownGeneDict[geneID]=0

    TranscriptDict =  {}

    linelist  = open(gtf)
    transcriptDict={}
    for line in linelist:
        if line.startswith('#'):
            continue
        fields=line.split('\n')[0].split('\t')
        if fields[2]!='exon':
            continue
        chr = fields[0]
        left = fields[3]
        right = fields[4]
        strand = fields[6]
        transcriptID=fields[8].split('transcript_id "')[1].split('"')[0]
        if doNovelOnly:
             geneID=fields[8].split('gene_id "')[1].split('"')[0]
             if KnownTranscriptDict.has_key(transcriptID) or KnownGeneDict.has_key(geneID):
                 continue 
        if transcriptDict.has_key(transcriptID):
            transcriptDict[transcriptID].append((chr,left,right,strand))
        else:
            transcriptDict[transcriptID]=[]
            transcriptDict[transcriptID].append((chr,left,right,strand))

    outfile = open(outfilename, 'w')

    linelist  = open(gtf)
    for line in linelist:
        if line.startswith('#'):
            continue
        fields=line.split('\n')[0].split('\t')
        if fields[2]!='exon':
            continue
        transcriptID=fields[8].split('transcript_id "')[1].split('"')[0]
        if len(transcriptDict[transcriptID]) > 1:
            outfile.write(line)

    outfile.close()

run()
