##################################
#                                #
# Last modified 11/22/2010       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
from sets import Set

try:
	import psyco
	psyco.full()
except:
	pass

def run():

    if len(sys.argv) < 4:
        print 'usage: python %s <junctions file> <known junctions file> <transcript models gtf filename> <outfilename>' % sys.argv[0]
        print '	Junctions file should already contain the cannonical/non-cannonical junctions type informaiton'
        print '	chr	left	right	strand	5-exon 5-intron|3-intron 3-exon'
        print '	known junctions format should be as follows:'
        print '	chr	left	right	strand'
        sys.exit(1)

    junctions = sys.argv[1]
    knownjunctions = sys.argv[2]
    gtf = sys.argv[3]
    outfilename = sys.argv[4]

    knownJunctionsDict={}
    linelist=open(knownjunctions)
    for line in linelist:
        if line.startswith('#'):
            continue
        fields=line.strip().split('\t')
        try:
            chr=fields[0]
            left=int(fields[1])
            right=int(fields[2])
            strand=fields[3]
        except:
            print 'skipping:', line.strip()
            continue
        knownJunctionsDict[(chr,left,right,strand)]=''
        knownJunctionsDict[(chr,left,right-1,strand)]=''
        knownJunctionsDict[(chr,left,right+1,strand)]=''
        knownJunctionsDict[(chr,left-1,right-1,strand)]=''
        knownJunctionsDict[(chr,left-1,right,strand)]=''
        knownJunctionsDict[(chr,left-1,right+1,strand)]=''
        knownJunctionsDict[(chr,left+1,right-1,strand)]=''
        knownJunctionsDict[(chr,left+1,right,strand)]=''
        knownJunctionsDict[(chr,left+1,right+1,strand)]=''
        knownJunctionsDict[(chr,left+2,right-1,strand)]=''
        knownJunctionsDict[(chr,left+2,right,strand)]=''
        knownJunctionsDict[(chr,left+2,right+1,strand)]=''

    JunctionDict={}

    linelist=open(junctions)
    for line in linelist:
        if line.startswith('#'):
            continue
        fields=line.strip().split('\t')
        try:
            chr=fields[0]
            left=int(fields[1])
            right=int(fields[2])
            strand=fields[3]
        except:
            print 'skipping:', line.strip()
            continue
        if JunctionDict.has_key(chr):
            pass
        else:
            JunctionDict[chr]={}
        JunctionDict[chr][(chr,left,right,strand)]={}
        JunctionDict[chr][(chr,left,right,strand)]['line']=line.strip()
        if knownJunctionsDict.has_key((chr,left,right,strand)):
            JunctionDict[chr][(chr,left,right,strand)]['novelty']='known'
        else:
            JunctionDict[chr][(chr,left,right,strand)]['novelty']='novel'

    print 'finished importing junctions'

    GeneDict={}
    ExonEndDict={}

    linelist=open(gtf)
    k=0
    for line in linelist:
        k+=1
        if k % 10000 == 0:
            print k, 'lines processed'
        if line.startswith('#'):
            continue
        fields=line.strip().split('\t')
        chr=fields[0]
        left=int(fields[3])
        right=int(fields[4])
        strand=fields[6]
        if ExonEndDict.has_key(chr):
            pass
        else:
            ExonEndDict[chr]={}
        geneName=fields[8].split('gene_name "')[1].split('";')[0]
        if fields[2]=='gene':
            if GeneDict.has_key(chr):
                pass
            else:
                GeneDict[chr]={}
            GeneDict[chr][geneName]=(left,right)
            continue
        if fields[2]!='exon':
            continue
        ExonEndDict[chr][left]=(geneName,strand,'left')
        ExonEndDict[chr][right]=(geneName,strand,'right')
        ExonEndDict[chr][left+1]=(geneName,strand,'left')
        ExonEndDict[chr][right+1]=(geneName,strand,'right')
        ExonEndDict[chr][left-1]=(geneName,strand,'left')
        ExonEndDict[chr][right-1]=(geneName,strand,'right')
        ExonEndDict[chr][left-2]=(geneName,strand,'left')
        ExonEndDict[chr][right-2]=(geneName,strand,'right')
        ExonEndDict[chr][left+2]=(geneName,strand,'left')
        ExonEndDict[chr][right+2]=(geneName,strand,'right')



    for chr in JunctionDict.keys():
        print chr
        for (chr,left,right,strand) in JunctionDict[chr].keys():
            if ExonEndDict[chr].has_key(left):
                (leftGene,leftGeneStrand,leftPos)= ExonEndDict[chr][left]
            else:
                internalTo=''
                for gene in GeneDict[chr].keys():
                    if left >= GeneDict[chr][gene][0] and left <= GeneDict[chr][gene][1]:
                        internalTo=internalTo+gene+','
                if internalTo=='':
                    leftGene='intergenic'
                else:
                    leftGene=internalTo[0:-1]
            if ExonEndDict[chr].has_key(right):
                (rightGene,rightGeneStrand,rightPos)= ExonEndDict[chr][right]
            else:
                internalTo=''
                for gene in GeneDict[chr].keys():
                    if right >= GeneDict[chr][gene][0] and right <= GeneDict[chr][gene][1]:
                        internalTo=internalTo+gene+','
                if internalTo=='':
                    rightGene='intergenic'
                else:
                    rightGene=internalTo[0:-1]
            if leftGene=='intergenic' and rightGene=='intergenic':
                JunctionDict[chr][(chr,left,right,strand)]['type']='intergenic'
                JunctionDict[chr][(chr,left,right,strand)]['genes']=(leftGene,rightGene)
                continue
            elif leftGene=='intergenic':
                if ExonEndDict[chr].has_key(right):
                    ExonGeneRight=ExonEndDict[chr][right][0]
                    if ExonGeneRight!=rightGene:
                        print 'ExonGeneRight!=rightGene'
                        print ExonGeneRight, rightGene
                        print chr,left,right,strand
                    JunctionDict[chr][(chr,left,right,strand)]['type']='intergenic to known exon'
                    JunctionDict[chr][(chr,left,right,strand)]['genes']=(leftGene,rightGene)
                else:
                    JunctionDict[chr][(chr,left,right,strand)]['type']='intergenic to unknown internal exon'
                    JunctionDict[chr][(chr,left,right,strand)]['genes']=(leftGene,rightGene)
                continue
            elif rightGene=='intergenic':
                if ExonEndDict[chr].has_key(left):
                    ExonGeneLeft=ExonEndDict[chr][left][0]
                    if ExonGeneLeft!=leftGene:
                        print 'ExonGeneLeft!=leftGene'
                        print ExonGeneLeft, leftGene
                        print chr,left,right,strand
                    JunctionDict[chr][(chr,left,right,strand)]['type']='intergenic to known exon'
                    JunctionDict[chr][(chr,left,right,strand)]['genes']=(leftGene,rightGene)
                else:
                    JunctionDict[chr][(chr,left,right,strand)]['type']='intergenic to unknown internal exon'
                    JunctionDict[chr][(chr,left,right,strand)]['genes']=(leftGene,rightGene)
                continue
            elif leftGene == rightGene:
                if ExonEndDict[chr].has_key(left) and ExonEndDict[chr].has_key(right):
                    JunctionDict[chr][(chr,left,right,strand)]['type']='known exon to known exon'
                    JunctionDict[chr][(chr,left,right,strand)]['genes']=(leftGene,rightGene)
                    continue
                elif ExonEndDict[chr].has_key(left) or ExonEndDict[chr].has_key(right):
                    JunctionDict[chr][(chr,left,right,strand)]['type']='known exon to unknown internal exon'
                    JunctionDict[chr][(chr,left,right,strand)]['genes']=(leftGene,rightGene)
                    continue
                else:
                    JunctionDict[chr][(chr,left,right,strand)]['type']='unknown internal exon to unknown internal exon'
                    JunctionDict[chr][(chr,left,right,strand)]['genes']=(leftGene,rightGene)
            elif leftGene != rightGene:
                if ExonEndDict[chr].has_key(left) and ExonEndDict[chr].has_key(right):
                    JunctionDict[chr][(chr,left,right,strand)]['type']='known exon to known exon, different genes'
                    JunctionDict[chr][(chr,left,right,strand)]['genes']=(leftGene,rightGene)
                    continue
                elif ExonEndDict[chr].has_key(left) or ExonEndDict[chr].has_key(right):
                    if leftGene not in rightGene and rightGene not in rightGene:
                        JunctionDict[chr][(chr,left,right,strand)]['type']='known exon to unknown internal exon, different genes'
                        JunctionDict[chr][(chr,left,right,strand)]['genes']=(leftGene,rightGene)
                    else:
                        JunctionDict[chr][(chr,left,right,strand)]['type']='known exon to unknown internal exon'
                        JunctionDict[chr][(chr,left,right,strand)]['genes']=(leftGene,rightGene)
                    continue
                else:
                    if ',' not in rightGene and ',' not in leftGene:
                        JunctionDict[chr][(chr,left,right,strand)]['type']='unknown internal exon to unknown internal exon, different genes'
                        JunctionDict[chr][(chr,left,right,strand)]['genes']=(leftGene,rightGene)
                    else:
                        leftGeneList=leftGene.split(',')
                        rightGeneList=rightGene.split(',')
                        LikelySameGene=False
                        for LG in leftGeneList:
                            if LG in rightGeneList:
                                LikelySameGene=True
                        if LikelySameGene:
                            JunctionDict[chr][(chr,left,right,strand)]['type']='unknown internal exon to unknown internal exon'
                            JunctionDict[chr][(chr,left,right,strand)]['genes']=(leftGene,rightGene)
                        else:
                            JunctionDict[chr][(chr,left,right,strand)]['type']='unknown internal exon to unknown internal exon, different genes'
                            JunctionDict[chr][(chr,left,right,strand)]['genes']=(leftGene,rightGene)

    outfile=open(outfilename,'w')

    chrkeys=JunctionDict.keys()
    chrkeys.sort()
    
    for chr in chrkeys:
        keys=JunctionDict[chr].keys()
        keys.sort()
        for (chr,left,right,strand) in keys:
            outline=JunctionDict[chr][(chr,left,right,strand)]['line']+'\t'+JunctionDict[chr][(chr,left,right,strand)]['novelty']+'\t'+JunctionDict[chr][(chr,left,right,strand)]['type']+'\t'+JunctionDict[chr][(chr,left,right,strand)]['genes'][0]+'\t'+JunctionDict[chr][(chr,left,right,strand)]['genes'][1]
            if JunctionDict[chr][(chr,left,right,strand)]['novelty']=='known' and ('unknown' in JunctionDict[chr][(chr,left,right,strand)]['type'] or 'intergenic' in JunctionDict[chr][(chr,left,right,strand)]['type'] or 'different' in JunctionDict[chr][(chr,left,right,strand)]['type']):
                print outline
            outfile.write(outline+'\n')
    outfile.close()

run()

