##################################
#                                #
# Last modified 12/30/2011       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
from sets import Set

def run():

    if len(sys.argv) < 2:
        print 'usage: python %s gtf outputfilename [-biotypeInField1]' % sys.argv[0]
        sys.exit(1)

    doBiotypeField1=False
    if '-biotypeInField1' in sys.argv:
        doBiotypeField1=True
    
    GTF = sys.argv[1]
    outfilename = sys.argv[2]

    GeneDict={}
    TranscriptDict={}
    linelist=open(GTF)
    for line in linelist:
        if line.startswith('#'):
            continue
        fields=line.strip().split('\t')
        geneID=fields[8].split('gene_id "')[1].split('";')[0]
        transcriptID=fields[8].split('transcript_id "')[1].split('";')[0]
        if doBiotypeField1:
            GeneBioType=fields[1]
            TranscriptBioType=fields[1]
        else:
            if 'gene_type "' in fields[8]:
                GeneBioType=fields[8].split('gene_type "')[1].split('";')[0]
            else:
                GeneBioType = 'unspecified'
            if 'transcript_type "' in fields[8]:
                TranscriptBioType=fields[8].split('transcript_type "')[1].split('";')[0]
            else:
                TranscriptBioType = 'unspecified'
        if GeneDict.has_key(GeneBioType):
            pass
        else:
            GeneDict[GeneBioType]={}
            GeneDict[GeneBioType]['genes']=[]
            GeneDict[GeneBioType]['transcripts']=[]
        if TranscriptDict.has_key(TranscriptBioType):
            pass
        else:
            TranscriptDict[TranscriptBioType]={}
            TranscriptDict[TranscriptBioType]['genes']=[]
            TranscriptDict[TranscriptBioType]['transcripts']=[]
        TranscriptDict[TranscriptBioType]['transcripts'].append(transcriptID)
        GeneDict[GeneBioType]['genes'].append(geneID)
        TranscriptDict[TranscriptBioType]['genes'].append(geneID)
        GeneDict[GeneBioType]['transcripts'].append(transcriptID)


    outfile = open(outfilename, 'w')
    outfile.write('#Genes:\n')
    outfile.write('#BioType\tGenes\tTranscripts:\n')
    biotypes=GeneDict.keys()
    biotypes.sort()
    for biotype in biotypes:
        GeneDict[biotype]['genes']=list(Set(GeneDict[biotype]['genes']))
        GeneDict[biotype]['transcripts']=list(Set(GeneDict[biotype]['transcripts']))
        outline=biotype + '\t' + str(len(GeneDict[biotype]['genes'])) + '\t' + str(len(GeneDict[biotype]['transcripts']))
        outfile.write(outline+'\n')
    outfile.write('#Transcripts:\n')
    outfile.write('#BioType\tGenes\tTranscripts:\n')
    biotypes=TranscriptDict.keys()
    biotypes.sort()
    for biotype in biotypes:
        TranscriptDict[biotype]['genes']=list(Set(TranscriptDict[biotype]['genes']))
        TranscriptDict[biotype]['transcripts']=list(Set(TranscriptDict[biotype]['transcripts']))
        outline=biotype + '\t' + str(len(TranscriptDict[biotype]['genes'])) + '\t' + str(len(TranscriptDict[biotype]['transcripts']))
        outfile.write(outline+'\n')
   
    outfile.close()
   
run()
