##################################
#                                #
# Last modified 03/13/2013       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
from sets import Set

def _gtf_attribute(attributes, key, default=None):
    """Extract the quoted value of ``key`` from a GTF attribute string.

    The value is the text between ``key "`` and the next ``";``.

    attributes -- the attribute column of a GTF line (9th tab-separated field)
    key        -- attribute name, e.g. 'gene_id'
    default    -- value returned when the attribute is absent; when it is
                  None the attribute is treated as required and a missing
                  attribute raises IndexError
    """
    marker = key + ' "'
    if default is not None and marker not in attributes:
        return default
    return attributes.split(marker)[1].split('";')[0]


def run():
    """Write a table of adjacent exon pairs for every transcript in a GTF file.

    Command-line arguments (read from sys.argv):
        sys.argv[1] -- path of the input GTF file
        sys.argv[2] -- path of the output file

    Only lines whose third field is 'exon' are used; lines starting with '#'
    and blank lines are skipped.  Exons are grouped by
    (gene_id, gene_name, transcript_id, transcript_name), where a missing
    gene_name / transcript_name falls back to the corresponding ID.

    For each transcript the exons are sorted as (chr, left, right, strand)
    tuples, and one output line is written per pair of consecutive exons:
    the left exon's coordinates, the strand, the interval from the left
    exon's right end to the right exon's left end, and the right exon's
    coordinates.  Transcripts with a single exon produce no lines.

    Exits with status 1 after printing a usage message when fewer than two
    arguments are supplied.  Raises IndexError when an exon line lacks
    gene_id or transcript_id, or has fewer than nine fields.
    """
    # Both the GTF path and the output path are required (argv[2] is read below).
    if len(sys.argv) < 3:
        print('usage: python %s gtf outputfilename' % sys.argv[0])
        sys.exit(1)

    GTF = sys.argv[1]
    outfilename = sys.argv[2]

    # Maps (geneID, geneName, transcriptID, transcriptName) to a list of
    # (chromosome, left, right, strand) exon tuples.
    TranscriptDict = {}
    with open(GTF) as linelist:
        for line in linelist:
            if line.startswith('#'):
                continue
            stripped = line.strip()
            if not stripped:
                # Tolerate blank lines instead of failing on fields[2].
                continue
            fields = stripped.split('\t')
            if fields[2] != 'exon':
                continue
            attributes = fields[8]
            geneID = _gtf_attribute(attributes, 'gene_id')
            transcriptID = _gtf_attribute(attributes, 'transcript_id')
            geneName = _gtf_attribute(attributes, 'gene_name', geneID)
            transcriptName = _gtf_attribute(attributes, 'transcript_name',
                                            transcriptID)
            transcript = (geneID, geneName, transcriptID, transcriptName)
            exon = (fields[0], int(fields[3]), int(fields[4]), fields[6])
            TranscriptDict.setdefault(transcript, []).append(exon)

    with open(outfilename, 'w') as outfile:
        outfile.write('#GeneID\tGeneName\tTranscriptID\tTranscriptName\tchr\tleft\tright\tstrand\tchr\tleft\tright\tchr\tleft\tright\n')

        for transcript in sorted(TranscriptDict):
            exons = sorted(TranscriptDict[transcript])
            outline_general_terms = '\t'.join(transcript)
            # Chromosome and strand are taken from the first sorted exon and
            # reused for every pair of this transcript.
            chromosome = exons[0][0]
            strand = exons[0][3]
            # zip over the list and its one-step shift yields consecutive
            # pairs; it is empty for single-exon transcripts.
            for left_exon, right_exon in zip(exons, exons[1:]):
                columns = [
                    outline_general_terms,
                    chromosome, str(left_exon[1]), str(left_exon[2]),
                    strand,
                    chromosome, str(left_exon[2]), str(right_exon[1]),
                    chromosome, str(right_exon[1]), str(right_exon[2]),
                ]
                outfile.write('\t'.join(columns) + '\n')
   
# Module-level entry point: run() is invoked as soon as this file is executed
# (there is no __main__ guard, so importing the module also triggers it).
run()
