##################################
#                                #
# Last modified 08/24/2010       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys

try:
    # Python 2 only: the ``sets`` module was removed in Python 3.
    from sets import Set
except ImportError:
    Set = set

def _attribute_value(attributes, key):
    """Return the quoted value stored under *key* in a GTF attribute column.

    The value is the text between the first ``key "`` marker and the next
    ``";``.  Returns None when the marker is absent, so the caller can report
    the record instead of failing with an IndexError.
    """
    pieces = attributes.split(key + ' "')
    if len(pieces) < 2:
        return None
    return pieces[1].split('";')[0]


def _collect_genes(gtf_lines):
    """Group 'gene' and 'transcript' records of a GTF by gene name.

    *gtf_lines* is any iterable of text lines.  The result maps each gene
    name to ``{'line': <first 'gene' line seen, or None>, 'transcripts':
    [<transcript names in file order>]}``.

    Lines starting with '#', lines with fewer than three tab-separated
    fields and records of any other feature type are ignored.  A gene or
    transcript record that lacks the attribute column or the needed
    ``gene_name`` / ``transcript_name`` attribute is printed with a
    'problematic' prefix and skipped.
    """
    genes = {}
    for count, line in enumerate(gtf_lines, 1):
        # Progress indicator; counts every line, including skipped ones.
        if count % 100000 == 0:
            print(count)
        if line.startswith('#'):
            continue
        fields = line.strip().split('\t')
        if len(fields) < 3 or fields[2] not in ('gene', 'transcript'):
            continue
        if len(fields) < 9:
            print('problematic ' + line.strip())
            continue

        attributes = fields[8]
        gene_name = _attribute_value(attributes, 'gene_name')
        if gene_name is None:
            print('problematic ' + line.strip())
            continue

        if fields[2] == 'transcript':
            transcript_name = _attribute_value(attributes, 'transcript_name')
            if transcript_name is None:
                print('problematic ' + line.strip())
                continue

        # Create the entry on first sight of the gene name, whichever record
        # type comes first, so a transcript preceding its gene is not fatal.
        entry = genes.get(gene_name)
        if entry is None:
            entry = genes[gene_name] = {'line': None, 'transcripts': []}

        if fields[2] == 'gene':
            # Only the first 'gene' record for a given name is kept.
            if entry['line'] is None:
                entry['line'] = line
        else:
            entry['transcripts'].append(transcript_name)
    return genes


def run():
    """Write the 'gene' lines of all single-transcript genes in a GTF file.

    Command line: ``python <script> gtf outfilename``.

    Reads the GTF named by ``sys.argv[1]`` and writes to ``sys.argv[2]`` one
    line per gene that has exactly one transcript record: the gene name, a
    tab, then the gene's original GTF line.  Genes for which no 'gene'
    record was seen are not written.

    Prints a usage message and exits with status 1 when either path is
    missing from the command line.
    """
    # Both paths are required, so argv must hold at least three entries.
    if len(sys.argv) < 3:
        print('usage: python %s gtf outfilename ' % sys.argv[0])
        sys.exit(1)

    gtf = sys.argv[1]
    outputfilename = sys.argv[2]

    # The output is opened first, as before, so an unwritable destination
    # fails before any parsing work is done.
    with open(outputfilename, 'w') as outfile:
        with open(gtf) as gtf_lines:
            genes = _collect_genes(gtf_lines)

        for name in genes:
            entry = genes[name]
            if entry['line'] is not None and len(entry['transcripts']) == 1:
                outfile.write(name + '\t' + entry['line'])

# Script entry point: run() is invoked unconditionally at module level, so
# importing this file also executes it against the current sys.argv.
run()

