##################################
#                                #
# Last modified 11/23/2009       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
from sets import Set

# psyco is an optional JIT accelerator: use it when it is installed and
# carry on without it otherwise.  Only a failed import is tolerated, so
# that Ctrl-C or a genuine error raised while enabling it is not hidden.
try:
    import psyco
    psyco.full()
except ImportError:
    pass

def _internal_exons(fields):
    """Return the (start, end) pairs of the internal exons of one input row.

    fields -- the tab-separated columns of one input line; column 8 holds
              the comma-separated exon starts, column 9 the exon ends.

    Elements 0, len-2 and len-1 of the split lists are left out.
    NOTE(review): this presumably relies on both lists ending with a
    trailing comma (the last split element is then empty and len-2 is the
    final exon), which makes the result "every exon except the first and
    the last" -- confirm against the input format.

    Rows whose start list splits into fewer than 4 elements yield an empty
    list.  Malformed rows raise IndexError (missing columns or coordinates)
    or ValueError (non-numeric coordinates).
    """
    exonStarts = fields[8].split(',')
    exonEnds = fields[9].split(',')
    if len(exonStarts) < 4:
        return []
    return [(int(exonStarts[i]), int(exonEnds[i]))
            for i in range(1, len(exonStarts) - 2)]


def _length_histogram(exons):
    """Return a dict mapping each exon length (end - start) to its count."""
    counts = {}
    for (start, end) in exons:
        length = end - start
        counts[length] = counts.get(length, 0) + 1
    return counts


def run():
    """Write a length histogram of the unique internal exons of a gene table.

    Command line: python <script> knownGene outfilename

    Reads the tab-separated input file named by sys.argv[1], collects the
    internal exons of every row, removes duplicate (start, end) pairs and
    writes one "length<TAB>count" line per distinct length, sorted by
    ascending length, to the file named by sys.argv[2].  The current line
    index is printed every 1000 input lines as a progress indicator.

    Exits with status 1 after printing a usage message when either path
    is missing from the command line.
    """
    # Both the input and the output path are read below, so three argv
    # entries (script name + two paths) are required.
    if len(sys.argv) < 3:
        print('usage: python %s knownGene outfilename' % sys.argv[0])
        sys.exit(1)

    inputfilename = sys.argv[1]
    outputfilename = sys.argv[2]

    # A set drops exons that are listed by more than one row.
    # NOTE(review): exons are de-duplicated on (start, end) alone; if the
    # input spans several chromosomes, identical coordinates on different
    # chromosomes are merged -- confirm that this is intended.
    uniqueExons = set()
    infile = open(inputfilename)
    try:
        for j, line in enumerate(infile):
            if j % 1000 == 0:
                print(j)
            stripped = line.strip()
            if not stripped:
                # Blank lines (e.g. at the end of the file) carry no row.
                continue
            uniqueExons.update(_internal_exons(stripped.split('\t')))
    finally:
        infile.close()

    ExonLengthDict = _length_histogram(uniqueExons)

    # The output is opened only after parsing succeeded, so a malformed
    # input does not leave a truncated result file behind.
    outfile = open(outputfilename, 'w')
    try:
        for length in sorted(ExonLengthDict):
            outfile.write(str(length) + '\t' + str(ExonLengthDict[length]) + '\n')
    finally:
        outfile.close()

# Script entry point.  There is no __main__ guard, so this call executes
# whenever the module is loaded, including when it is imported.
run()

