##################################
#                                #
# Last modified 12/29/2011       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
import math
from sets import Set

def run():

    if len(sys.argv) < 2:
        print 'usage: python %s gtf outfilename' % sys.argv[0]
        print '	a cuffcompare produced gtf file is assumed' 
        sys.exit(1)

    gtf = sys.argv[1]
    outfilename = sys.argv[2]

    geneDict={}

    lineslist  = open(gtf)
    i=0
    for line in lineslist:
        if i % 100000 == 0:
            print i
        i+=1
        if line[0]=='#':
            continue
        fields=line.strip().split('\t')
        chr=fields[0]
        left=fields[3]
        right=fields[4]
        strand=fields[6]
        if fields[2]!='exon':
            continue
        geneID=fields[8].split('gene_id "')[1].split('";')[0]
        if 'gene_name "' in fields[8]:
            geneName=fields[8].split('gene_name "')[1].split('";')[0]
        else:
            geneName=geneID
        gene_type=fields[8].split('gene_type "')[1].split('";')[0]
        if geneDict.has_key((geneID,geneName)):
            pass
        else:
            geneDict[(geneID,geneName)]={}
            geneDict[(geneID,geneName)]['exons']=[]
        geneDict[(geneID,geneName)]['exons'].append((chr,left,right,strand))
        geneDict[(geneID,geneName)]['gene_type']=gene_type

    outfile = open(outfilename, 'w')

    outline='#geneID\tgeneName\tNumber_exons\tgene_type\n'
    outfile.write(outline)

    for (geneID,geneName) in geneDict.keys():
        geneDict[(geneID,geneName)]['exons']=list(Set(geneDict[(geneID,geneName)]['exons']))
        outline=geneID + '\t' + geneName + '\t' + str(len(geneDict[(geneID,geneName)]['exons']))+'\t'+geneDict[(geneID,geneName)]['gene_type']
        outfile.write(outline+'\n')

    outfile.close()
        
run()

