##################################
#                                #
# Last modified 2018/11/21       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
import math
from sets import Set

def _gtf_attribute(attributes, key, default=None):
    """Return the double-quoted value of ``key`` in a GTF attribute column.

    ``default`` is returned when the text ``key "`` does not occur in
    ``attributes``.
    """
    marker = key + ' "'
    if marker not in attributes:
        return default
    return attributes.split(marker)[1].split('"')[0]


def _transcript_biotype(attributes):
    """Return the first of transcript_type / gene_type / gene_biotype, or None."""
    for key in ('transcript_type', 'gene_type', 'gene_biotype'):
        value = _gtf_attribute(attributes, key)
        if value is not None:
            return value
    return None


def run():
    """Split the transcripts of a GTF file into per-biotype ID tables.

    Command line: ``python <script> gtf outfileprefix``.

    Only ``exon`` records are read.  The first exon seen for each
    ``transcript_id`` produces one row (gene ID, gene name, transcript ID,
    transcript name) in ``<outfileprefix>.<biotype>.IDs`` and the same row
    plus the biotype in ``<outfileprefix>.combined``.  Missing gene or
    transcript names fall back to the corresponding IDs.  The biotype is
    taken from ``transcript_type``, else ``gene_type``, else
    ``gene_biotype``.

    Exits with status 1 after printing a usage message when fewer than two
    arguments are supplied.

    Raises:
        ValueError: if an exon record has fewer than nine columns or lacks
            ``gene_id``, ``transcript_id`` or any biotype attribute.
    """
    # Both the GTF path and the output prefix are required, so argv needs
    # at least three entries (script name + two arguments).
    if len(sys.argv) < 3:
        print('usage: python %s gtf outfileprefix' % sys.argv[0])
        sys.exit(1)

    gtf = sys.argv[1]
    outfileprefix = sys.argv[2]

    header = '#GeneID\t#GeneName\tTranscriptID\tTranscriptName'
    seen_transcripts = set()
    # The combined handle is kept out of this dict so that a biotype that
    # happens to be called "all" cannot collide with it.
    type_files = {}

    combined = open(outfileprefix + '.' + 'combined', 'w')
    try:
        combined.write(header + '\tBioType\n')

        with open(gtf) as gtf_lines:
            for line_number, line in enumerate(gtf_lines, 1):
                # Skip comment/header lines and blank lines.
                if line.startswith('#') or not line.strip():
                    continue
                fields = line.strip().split('\t')
                if fields[2] != 'exon':
                    continue
                if len(fields) < 9:
                    raise ValueError(
                        '%s line %d: exon record has %d columns, expected 9'
                        % (gtf, line_number, len(fields)))
                attributes = fields[8]

                transcript_id = _gtf_attribute(attributes, 'transcript_id')
                gene_id = _gtf_attribute(attributes, 'gene_id')
                if transcript_id is None or gene_id is None:
                    raise ValueError(
                        '%s line %d: exon record lacks gene_id or transcript_id'
                        % (gtf, line_number))

                # Only the first exon of each transcript yields a row.
                if transcript_id in seen_transcripts:
                    continue
                seen_transcripts.add(transcript_id)

                transcript_name = _gtf_attribute(
                    attributes, 'transcript_name', transcript_id)
                gene_name = _gtf_attribute(attributes, 'gene_name', gene_id)

                transcript_type = _transcript_biotype(attributes)
                if transcript_type is None:
                    raise ValueError(
                        '%s line %d: no transcript_type, gene_type or '
                        'gene_biotype attribute' % (gtf, line_number))

                if transcript_type not in type_files:
                    type_files[transcript_type] = open(
                        outfileprefix + '.' + transcript_type + '.IDs', 'w')
                    # Report each newly created per-biotype output file.
                    print(type_files[transcript_type])
                    type_files[transcript_type].write(header + '\n')

                row = '\t'.join(
                    (gene_id, gene_name, transcript_id, transcript_name))
                type_files[transcript_type].write(row + '\n')
                combined.write(row + '\t' + transcript_type + '\n')
    finally:
        # Close every output even when a malformed line aborts the run.
        for handle in type_files.values():
            handle.close()
        combined.close()
        
# Script entry point: calls run(), which reads its arguments from sys.argv.
# NOTE(review): there is no `if __name__ == '__main__':` guard, so importing
# this module also executes run() — confirm whether that is intended.
run()

