##################################
#                                #
# Last modified 11/04/2011       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
import math
from sets import Set

def _attribute(attributes, key):
    """Return the quoted value stored under ``key`` in a GTF attribute string.

    ``attributes`` is the text of the ninth GTF column, e.g.
    ``gene_id "G1"; transcript_id "T1"; gene_name "Alpha";``.

    Raises IndexError when ``key`` does not occur in ``attributes``.
    """
    return attributes.split(key + ' "')[1].split('";')[0]


def _collect_records(lines, biotype):
    """Collect the unique (geneID, transcriptID, geneName) triples.

    ``lines`` is any iterable of GTF lines.  Lines starting with '#' and
    blank lines are ignored.  A line is selected when its second
    tab-separated column equals ``biotype``.  Triples are returned as a
    list in the order in which they are first encountered.

    Raises IndexError for a selected line that lacks the attribute column
    or one of the gene_id / transcript_id / gene_name attributes.
    """
    seen = set()
    records = []
    for line in lines:
        stripped = line.strip()
        # Blank lines carry no fields; indexing into them would fail.
        if not stripped or line.startswith('#'):
            continue
        fields = stripped.split('\t')
        if fields[1] != biotype:
            continue
        attributes = fields[8]
        record = (_attribute(attributes, 'gene_id'),
                  _attribute(attributes, 'transcript_id'),
                  _attribute(attributes, 'gene_name'))
        # The same transcript shows up on many lines; report it only once.
        if record not in seen:
            seen.add(record)
            records.append(record)
    return records


def run():
    """Write the gene/transcript IDs of one biotype from a GTF file.

    Command line: ``python <script> gtf biotype outfilename``.

    Reads the GTF file named by the first argument, keeps the lines whose
    second column equals the ``biotype`` argument, and writes a
    tab-separated table (geneID, geneName, transcriptID) with a '#'-prefixed
    header line to ``outfilename``.  Each distinct triple is written once.

    Prints a usage message and exits with status 1 when fewer than three
    arguments are supplied.
    """
    # Three positional arguments plus the script name are required.
    if len(sys.argv) < 4:
        print('usage: python %s gtf biotype outfilename' % sys.argv[0])
        sys.exit(1)

    gtf = sys.argv[1]
    biotype = sys.argv[2]
    outfilename = sys.argv[3]

    with open(gtf) as gtf_file:
        records = _collect_records(gtf_file, biotype)

    with open(outfilename, 'w') as outfile:
        outfile.write('#geneID\tgeneName\ttranscriptID' + '\n')
        for geneID, transcriptID, geneName in records:
            outfile.write(geneID + '\t' + geneName + '\t' + transcriptID + '\n')
        
# Run only when executed as a script, so that importing this module does not
# parse sys.argv, touch files or call sys.exit as a side effect.
if __name__ == '__main__':
    run()

