##################################
#                                #
# Last modified 08/11/2010       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
import math
from sets import Set

def run():
    """Count the distinct exons of every transcript in a GTF file.

    Command line (read from ``sys.argv``)::

        python <script> gtf outfilename

    ``gtf`` is assumed to be a cuffcompare-produced GTF file. Only lines
    whose feature column (3rd field) is ``exon`` are used; each must carry
    both a ``transcript_id "..."`` and a ``class_code "..."`` attribute in
    the 9th field. Lines starting with ``#`` and blank lines are skipped.

    ``outfilename`` receives a tab-separated table: a ``#``-prefixed header
    followed by one row per transcript holding the transcript id, the number
    of distinct ``(chromosome, start, end, strand)`` exons, and the
    class_code taken from the last exon line seen for that transcript.

    A running line count is printed to stdout every 100000 input lines.

    Raises:
        SystemExit: with status 1 when fewer than two arguments are given.
        IndexError: when an exon line lacks an expected column or attribute.
    """

    # Both positional arguments are required: sys.argv[2] is read below,
    # so the guard must demand at least three argv entries.
    if len(sys.argv) < 3:
        print('usage: python %s gtf outfilename' % sys.argv[0])
        print('\ta cuffcompare produced gtf file is assumed')
        sys.exit(1)

    gtf = sys.argv[1]
    outfilename = sys.argv[2]

    # transcript id -> {'exons': set of (chrom, left, right, strand),
    #                   'code': class_code string}
    TranscriptDict = {}

    with open(gtf) as lineslist:
        for i, line in enumerate(lineslist):
            # Progress indicator; counts every input line, comments included.
            if i % 100000 == 0:
                print(i)
            if line.startswith('#') or not line.strip():
                continue
            fields = line.strip().split('\t')
            if fields[2] != 'exon':
                continue
            chrom = fields[0]
            left = fields[3]
            right = fields[4]
            strand = fields[6]
            transcript = fields[8].split('transcript_id "')[1].split('";')[0]
            class_code = fields[8].split('class_code "')[1].split('";')[0]
            if transcript not in TranscriptDict:
                TranscriptDict[transcript] = {'exons': set()}
            # A set collapses exons that are listed more than once.
            TranscriptDict[transcript]['exons'].add((chrom, left, right, strand))
            TranscriptDict[transcript]['code'] = class_code

    with open(outfilename, 'w') as outfile:
        # Header columns are listed in the order the rows are written:
        # transcript, exon count, class code.
        outfile.write('#Transcript\tNumber_exons\tclass_code\n')
        for transcript in TranscriptDict:
            entry = TranscriptDict[transcript]
            outline = transcript + '\t' + str(len(entry['exons'])) + '\t' + entry['code']
            outfile.write(outline + '\n')
        
# Script entry point: this call is unguarded, so run() executes whenever
# the module is loaded and reads its arguments from sys.argv.
run()

