##################################
#                                #
# Last modified 7/28/2010        # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
from sets import Set

try:
	import psyco
	psyco.full()
except:
	pass

def run():

    if len(sys.argv) < 4:
        print 'usage: python %s gtf transcript.exp annotation.gtf outputfilename' % sys.argv[0]
        sys.exit(1)

    gtf = sys.argv[1]
    expr = sys.argv[2]
    annotation = sys.argv[3]
    outputfilename = sys.argv[4]

    ExonDict={}
    TranscriptDict={}

    print 'inputing de novo gtf'
    lineslist = open(gtf)
    for line in lineslist:
        if line.startswith('#'):
            continue
        fields = line.strip().split('\t')
        if fields[2]!='exon':
            continue
        chr=fields[0]
        left=fields[3]
        right=fields[4]
        strand=fields[6]
        transcript=fields[8].split('transcript_id "')[1].split('";')[0]
        if TranscriptDict.has_key(transcript):
            TranscriptDict[transcript].append((chr,left,right,strand))
        else:
            TranscriptDict[transcript]=[]
            TranscriptDict[transcript].append((chr,left,right,strand))

    print 'compiling exons list'
    for transcript in TranscriptDict.keys():
        SingleExon=False
        TranscriptDict[transcript].sort()
        if TranscriptDict[transcript][0][3]=='-':
            TranscriptDict[transcript].reverse()
        if len(TranscriptDict[transcript])==1:
            SingleExon=True
        for (chr,left,right,strand) in TranscriptDict[transcript]:
            Type='?'
            if SingleExon:
                Type='S'
            elif TranscriptDict[transcript].index((chr,left,right,strand))==0:
                Type='F'
            elif TranscriptDict[transcript].index((chr,left,right,strand))==len(TranscriptDict[transcript])-1:
                Type='L'
            else:
                Type='M'
            if ExonDict.has_key((chr,left,right,strand)):
                pass
            else:
                ExonDict[(chr,left,right,strand)]={}
            ExonDict[(chr,left,right,strand)]['transcripts']=[]
            ExonDict[(chr,left,right,strand)]['type']=[]
            ExonDict[(chr,left,right,strand)]['match']=[]
            ExonDict[(chr,left,right,strand)]['type'].append(Type)
            ExonDict[(chr,left,right,strand)]['transcripts'].append(transcript)
        
    print 'inputing expression values'
    ExpressionDict={}
    lineslist = open(expr)
    for line in lineslist:
        fields = line.strip().split('\t')
        transcript=fields[0]
        FPKM=fields[5]
        FPKM_conf_lo=fields[8]
        FPKM_conf_hi=fields[9]
        ExpressionDict[transcript]=(FPKM,FPKM_conf_lo,FPKM_conf_hi)

    print 'inputing annotation'
    AnnotationTranscriptDict={}
    AnnotationExonDict={}
    lineslist = open(annotation)
    for line in lineslist:
        if line.startswith('#'):
            continue
        fields = line.strip().split('\t')
        if fields[2]!='exon':
            continue
        chr=fields[0]
        left=fields[3]
        right=fields[4]
        strand=fields[6]
        transcript=fields[8].split('transcript_id "')[1].split('";')[0]
        if AnnotationTranscriptDict.has_key(transcript):
            AnnotationTranscriptDict[transcript].append((chr,left,right,strand))
        else:
            AnnotationTranscriptDict[transcript]=[]
            AnnotationTranscriptDict[transcript].append((chr,left,right,strand))

    print 'compiling exons list'
    for transcript in AnnotationTranscriptDict.keys():
        SingleExon=False
        AnnotationTranscriptDict[transcript].sort()
        if AnnotationTranscriptDict[transcript][0][3]=='-':
            AnnotationTranscriptDict[transcript].reverse()
        if len(AnnotationTranscriptDict[transcript])==1:
            SingleExon=True
        for (chr,left,right,strand) in AnnotationTranscriptDict[transcript]:
            Type='?'
            if SingleExon:
                Type='S'
            elif AnnotationTranscriptDict[transcript].index((chr,left,right,strand))==0:
                Type='F'
            elif AnnotationTranscriptDict[transcript].index((chr,left,right,strand))==len(AnnotationTranscriptDict[transcript])-1:
                Type='L'
            else:
                Type='M'
            if AnnotationExonDict.has_key((chr,left,right,strand)):
                pass
            else:
                AnnotationExonDict[(chr,left,right,strand)]={}
            AnnotationExonDict[(chr,left,right,strand)]['transcripts']=[]
            AnnotationExonDict[(chr,left,right,strand)]['type']=[]
            AnnotationExonDict[(chr,left,right,strand)]['type'].append(Type)
            AnnotationExonDict[(chr,left,right,strand)]['transcripts'].append(transcript)

    print 'comparing to annotaiton'
    NoMatchList=[]
    for (chr,left,right,strand) in ExonDict.keys():
        if AnnotationExonDict.has_key((chr,left,right,strand)):
            ExonDict[(chr,left,right,strand)]['match'].append('perfect')
        else:
            NoMatchList.append((chr,left,right,strand))

    AnnotationLeftDict={}
    AnnotationRightDict={}
    for (chr,left,right,strand) in AnnotationExonDict.keys():
        AnnotationLeftDict[(chr,left,strand)]=''
        AnnotationRightDict[(chr,right,strand)]=''

    for (chr,left,right,strand) in NoMatchList:
        if AnnotationRightDict.has_key((chr,right,strand)):
            ExonDict[(chr,left,right,strand)]['match'].append('right')
        elif AnnotationLeftDict.has_key((chr,left,strand)):
            ExonDict[(chr,left,right,strand)]['match'].append('left')
        else:
            ExonDict[(chr,left,right,strand)]['match'].append('no_match')

    outfile = open(outputfilename, 'w')
    outline=('#chr\tleft\tright\tstrand\tType\tMatch\tFPKM\tFPKM_conf_lo\tFPKM_conf_hi\tTranscripts\n')
    outfile.write(outline)

    ExonsList=[]
    for (chr,left,right,strand) in ExonDict.keys():
        ExonsList.append((chr,int(left),int(right),strand))
    ExonsList.sort()

    for (chr,left,right,strand) in ExonsList:
        left=str(left)
        right=str(right)
        outline=chr+'\t'+left+'\t'+right+'\t'+strand+'\t'
        types=list(Set(ExonDict[(chr,left,right,strand)]['type']))
        matches=list(Set(ExonDict[(chr,left,right,strand)]['match']))
        for type in types:
            outline=outline+type+','
        outline=outline[0:-1]+'\t'
        for match in matches:
            outline=outline+match+','
        Expression=0
        Expression_conf_lo=0
        Expression_conf_hi=0
        for transcript in ExonDict[(chr,left,right,strand)]['transcripts']:
            (FPKM,FPKM_conf_lo,FPKM_conf_hi)=ExpressionDict[transcript]
            Expression_conf_lo=float(Expression_conf_lo)+float(FPKM_conf_lo)
            Expression_conf_hi=float(Expression_conf_hi)+float(FPKM_conf_hi)
            Expression=float(Expression)+float(FPKM)
        outline=outline[0:-1]+'\t'+str(Expression)+'\t'+str(Expression_conf_lo)+'\t'+str(Expression_conf_hi)+'\t'
        for transcript in ExonDict[(chr,left,right,strand)]['transcripts']:
            outline=outline+transcript+'|'+ExpressionDict[transcript][0]+'|'+ExpressionDict[transcript][1]+'|'+ExpressionDict[transcript][2]+','
        outfile.write(outline[0:-1]+'\n')
    outfile.close()

run()

