##################################
#                                #
# Last modified 11/25/2010       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
from sets import Set

def run():
    """Summarise isoform and distinct-protein counts per (gene, type).

    Command-line arguments (read from ``sys.argv``):
        sys.argv[1] -- path of the tab-delimited ORF table to read.
        sys.argv[2] -- path of the summary file to write.

    Lines of the ORF table starting with '#' and blank lines are skipped.
    For every other line, column 1 is the transcript name (the gene name is
    the text before the first '-'), column 2 is the transcript type and
    column 9 is the protein sequence.

    The output has a header line followed by one tab-delimited line per
    (gene, type) pair: gene, type, number of transcripts seen for the pair,
    and number of distinct protein sequences among them.

    Exits with status 1 after printing usage when fewer than two
    command-line arguments are supplied. Raises IndexError if a data line
    has fewer than nine tab-separated columns.
    """
    # Both the input and the output path are required, so argv needs at
    # least three entries (program name plus two arguments).
    if len(sys.argv) < 3:
        print('usage: python %s ORF outfilename' % sys.argv[0])
        print('       ORF file format: #transcriptName	transcriptType	chr	LeftPos,RightPos	orientation	Start_codon_pos,Stop_codon_pos	RNA_length	protein_length	protein')
        print('       transcriptName format" gene_name + dash + transcript number')
        sys.exit(1)

    orf_path = sys.argv[1]
    output_path = sys.argv[2]

    # (gene, transcript type) -> list of protein sequences, one per transcript.
    proteins_by_gene = {}

    with open(orf_path) as orf_file:
        for line in orf_file:
            if line.startswith('#'):
                continue
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing empty line).
                continue
            # Drop the line terminator so that a protein in the last column
            # compares equal whether or not its line ends with a newline.
            fields = line.rstrip('\r\n').split('\t')
            # NOTE(review): a gene name that itself contains '-' is cut at
            # its first dash here -- confirm against the real input naming.
            gene = fields[0].split('-')[0]
            transcript_type = fields[1]
            protein = fields[8]
            # Each transcript is recorded exactly once.
            key = (gene, transcript_type)
            proteins_by_gene.setdefault(key, []).append(protein)

    with open(output_path, 'w') as outfile:
        outfile.write('#gene\ttype\tnumber_isoforms\tnumber_proteins\n')
        for (gene, transcript_type), proteins in proteins_by_gene.items():
            outfile.write('%s\t%s\t%d\t%d\n' % (
                gene, transcript_type, len(proteins), len(set(proteins))))

# Run the summary only when this file is executed as a script, so that
# importing the module does not trigger argument parsing or file I/O.
if __name__ == '__main__':
    run()

