##################################
#                                #
# Last modified 12/16/2010       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
from sets import Set

def run():

    if len(sys.argv) < 4:
        print 'usage: python %s <gtf filename> <expr filename> <outputfilename> <-genes | -transcripts> [-refSeq refFlat.txt]' % sys.argv[0]
        sys.exit(1)
    
    gtf = sys.argv[1]
    expr = sys.argv[2]
    outfilename = sys.argv[3]
    type = sys.argv[4]

    doRefSeq=False
    if '-refSeq' in sys.argv:
        doRefSeq=True
        RefSeqDict={}
        refFlat = sys.argv[sys.argv.index('-refSeq')+1]
        linelist=open(refFlat)
        for line in linelist:
            fields=line.strip().split('\t')
            ID=fields[1]
            name=fields[0]
            RefSeqDict[ID]=name

    ExprDict={}
    GeneDict={}
    TranscriptDict={}
    linelist = open(gtf)
    print 'Inputting annotation'
    for line in linelist:
        if line.startswith('#'):
            continue
        fields=line.strip().split('\t')
        if type == '-genes':
            geneID = fields[8].split('gene_id "')[1].split('";')[0]
            if 'gene_name' not in fields[8]:
                gene = fields[8].split('gene_id "')[1].split('";')[0]
                if doRefSeq:
                    try:
                        gene=RefSeqDict[gene]
                    except:
                        pass
            else:
                gene = fields[8].split('gene_name "')[1].split('";')[0]
            if GeneDict.has_key(geneID):
                if GeneDict[geneID]!=gene:
                    print 'problem with', geneID, gene, GeneDict[geneID], 'more than one transcript name for the same transcript ID found'
                else:
                    continue
            else:
                GeneDict[geneID]=gene
        if type == '-transcripts':
            transcriptID = fields[8].split('transcript_id "')[1].split('";')[0]
            if 'transcript_name' not in fields[8]:
                if 'oId' not in fields[8]:
                    transcript = fields[8].split('transcript_id "')[1].split('";')[0]
                else:
                    transcript = fields[8].split('oId "')[1].split('";')[0]
                if doRefSeq:
                    try:
                        transcript=RefSeqDict[transcript]
                    except:
                        pass
            else:
                transcript = fields[8].split('transcript_name "')[1].split('";')[0]
            if TranscriptDict.has_key(transcriptID):
                if TranscriptDict[transcriptID]!=transcript:
                    print 'problem with', transcriptID, transcript, TranscriptDict[transcriptID], 'more than one transcript name for the same transcript ID found'
                else:
                    continue
            else:
                TranscriptDict[transcriptID]=transcript

    print 'found', len(TranscriptDict.keys()), 'transcripts'

    linelist = open(expr)
    print 'Inputting expression values'
    for line in linelist:
        if line.startswith('trans_id'):
            continue
        if line.startswith('gene_id'):
            continue
        fields=line.strip().split('\t')
        ID=fields[0]
        chr=fields[2]
        left=fields[3]
        right=fields[4]
        if type=='-genes':
            FPKM=fields[5]
            FPKM_conf_lo=fields[6]
            FPKM_conf_hi=fields[7]
            status=fields[8]
            ExprDict[ID]=(chr,left,right,FPKM,FPKM_conf_lo,FPKM_conf_hi,status)
        if type=='-transcripts':
            try:
                FPKM=fields[5]
                FPKM_conf_lo=fields[8]
                FPKM_conf_hi=fields[9]
                status=fields[13]
                ExprDict[ID]=(chr,left,right,FPKM,FPKM_conf_lo,FPKM_conf_hi,status)
            except:
                print 'skipping', fields
                continue
            
    outfile=open(outfilename,'w')

    print 'Outputting data'

    problematic=0
    if type=='-genes':
        outfile.write('#gene\tID\tchr\tleft\tright\tFPKM\tFPKM_conf_lo\tFPKM_conf_hi\tstatus\n')
        for ID in GeneDict:
            gene=GeneDict[ID]
            if ExprDict.has_key(ID):
                (chr,left,right,FPKM,FPKM_conf_lo,FPKM_conf_hi,status)=ExprDict[ID]
            else:
                problematic+=1
                continue
            outline=gene+'\t'+ID+'\t'+chr+'\t'+left+'\t'+right+'\t'+FPKM+'\t'+FPKM_conf_lo+'\t'+FPKM_conf_hi+'\t'+status
            outfile.write(outline+'\n')

    if type=='-transcripts':
        outfile.write('#transcript\tID\tchr\tleft\tright\tFPKM\tFPKM_conf_lo\tFPKM_conf_hi\tstatus\n')
        for ID in TranscriptDict:
            transcript=TranscriptDict[ID]
            if ExprDict.has_key(ID):
                (chr,left,right,FPKM,FPKM_conf_lo,FPKM_conf_hi,status)=ExprDict[ID]
            else:
                problematic+=1
                continue
            outline=transcript+'\t'+ID+'\t'+chr+'\t'+left+'\t'+right+'\t'+FPKM+'\t'+FPKM_conf_lo+'\t'+FPKM_conf_hi+'\t'+status
            outfile.write(outline+'\n')

    print 'transcripts or genes in annotation but not in expression file: ', problematic

    outfile.close()

run()
