##################################
#                                #
# Last modified 12/23/2011       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
from sets import Set

def _attribute_value(attributes, key):
    """Return the quoted value following `key "` in a GTF attribute string.

    Raises IndexError when the key is not present in `attributes`.
    """
    return attributes.split(key + ' "')[1].split('";')[0]


def run():
    """Count the distinct transcripts of every gene in a GTF file.

    Reads the GTF path from sys.argv[1] and the output path from
    sys.argv[2]. Only 'exon' records are considered. The output is a
    tab-delimited table with one row per (gene_id, gene_name) pair and the
    number of distinct transcript_ids seen for it; gene_id is used as the
    name when a record has no gene_name attribute.

    Prints a usage message and exits with status 1 when either of the two
    command-line arguments is missing.
    """
    # Both the input and the output path are required, i.e. argv needs
    # three entries including the script name.
    if len(sys.argv) < 3:
        print('usage: python %s gtf outputfilename' % sys.argv[0])
        sys.exit(1)

    GTF = sys.argv[1]
    outfilename = sys.argv[2]

    # (geneID, geneName) -> set of transcript IDs
    GeneDict = {}
    with open(GTF) as linelist:
        for line in linelist:
            if line.startswith('#'):
                continue
            fields = line.strip().split('\t')
            # Blank or truncated lines have no feature/attribute columns;
            # skip them instead of failing on the index lookups below.
            if len(fields) < 9:
                continue
            if fields[2] != 'exon':
                continue
            attributes = fields[8]
            geneID = _attribute_value(attributes, 'gene_id')
            if 'gene_name "' in attributes:
                geneName = _attribute_value(attributes, 'gene_name')
            else:
                geneName = geneID
            transcriptID = _attribute_value(attributes, 'transcript_id')
            GeneDict.setdefault((geneID, geneName), set()).add(transcriptID)

    with open(outfilename, 'w') as outfile:
        outfile.write('#GeneID\tGeneName\tNumberTranscripts\n')
        for (geneID, geneName) in GeneDict:
            transcripts = GeneDict[(geneID, geneName)]
            outline = geneID + '\t' + geneName + '\t' + str(len(transcripts))
            outfile.write(outline + '\n')
   
# Script entry point: run() is called unconditionally when this module is
# loaded, so importing the file also executes it.
run()
