##################################
#                                #
# Last modified 03/18/2012       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string

def _gtf_attribute(attributes, key):
    """Return the quoted value of ``key`` in a GTF attribute column.

    Looks for the text ``key "value"`` inside ``attributes`` and returns
    ``value``. Returns None when the key does not occur.
    """
    marker = key + ' "'
    if marker not in attributes:
        return None
    return attributes.split(marker)[1].split('"')[0]


def run():
    """Append the gene type of both junction genes to every junction line.

    Command-line arguments (read from ``sys.argv``):
        1. junctions   -- tab-separated file; the 7th and 8th columns hold
                          the names of the two genes joined by the junction.
        2. GTF         -- annotation file; only ``exon`` records are used,
                          and their 9th column must carry ``gene_name "..."``
                          and ``gene_type "..."`` attributes.
        3. outfilename -- output file; each non-comment junction line is
                          written back with two extra tab-separated columns,
                          the gene types of the first and second gene.

    Lines starting with ``#`` and blank lines are ignored in both inputs.
    A gene that has no matching exon record in the GTF gets an empty string
    as its type. When a gene has several exon records, the last one read
    determines the type.

    Exits with status 1 after printing a usage message when fewer than
    three arguments are supplied.
    """
    # Three positional arguments are required, so argv needs 4 entries.
    if len(sys.argv) < 4:
        print('usage: python %s junctions GTF outfilename' % sys.argv[0])
        print('   junctions format:')
        print('   chrY	9175621	9196544	+	known exon to known exon, different genes	55.0	TSPY4	TSPY8	novel	GT|AG')
        sys.exit(1)

    junctions = sys.argv[1]
    GTF = sys.argv[2]
    outfilename = sys.argv[3]

    # Pass 1: collect every gene name mentioned in the junctions file.
    # The empty string is the default type for genes absent from the GTF.
    gene_types = {}
    with open(junctions) as junction_file:
        for line in junction_file:
            if line.startswith('#') or not line.strip():
                continue
            fields = line.strip().split('\t')
            gene_types[fields[6]] = ''
            gene_types[fields[7]] = ''

    print('finished inputting junctions')

    # Pass 2: look up the gene type of each collected gene in the GTF.
    with open(GTF) as gtf_file:
        for line in gtf_file:
            if line.startswith('#'):
                continue
            fields = line.strip().split('\t')
            # Skip blank/truncated records and anything that is not an exon.
            if len(fields) < 9 or fields[2] != 'exon':
                continue
            gene_name = _gtf_attribute(fields[8], 'gene_name')
            # Also skips records without a gene_name (None is never a key).
            if gene_name not in gene_types:
                continue
            gene_type = _gtf_attribute(fields[8], 'gene_type')
            if gene_type is not None:
                gene_types[gene_name] = gene_type

    print('finished inputting GTF')

    # Pass 3: re-read the junctions and emit them with the two type columns.
    with open(outfilename, 'w') as outfile:
        with open(junctions) as junction_file:
            for line in junction_file:
                if line.startswith('#') or not line.strip():
                    continue
                record = line.strip()
                fields = record.split('\t')
                outfile.write('\t'.join(
                    [record, gene_types[fields[6]], gene_types[fields[7]]]
                ) + '\n')

# Only run when the file is executed as a script, so that importing the
# module does not trigger argument parsing and file I/O.
if __name__ == '__main__':
    run()
