##################################
#                                #
# Last modified 03/23/2014       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
from sets import Set
import math

def _int_option(argv, flag, default=0):
    """Return the integer that follows `flag` in `argv`, or `default` if absent."""
    if flag not in argv:
        return default
    return int(argv[argv.index(flag) + 1])


def _split_blocks(field):
    """Split a comma-separated PSL list field, ignoring empty items.

    PSL list fields normally end with a trailing comma; filtering empty items
    handles both that form and a list written without the trailing comma.
    """
    return [item for item in field.split(',') if item]


def _read_best_alignments(psl_path, min_q_size, min_match):
    """Parse a PSL file, keeping the top-scoring alignment(s) of each query.

    Returns a dict mapping query name -> (best match count, list of alignment
    dicts in file order).  Alignments tied for the best match count are all
    kept; as soon as a strictly better one is seen, everything stored so far
    for that query is discarded.
    """
    header_prefixes = ('#', 'psLayout version 3', 'match', '     ',
                       '--------------------------')
    best = {}
    with open(psl_path) as psl:
        for line in psl:
            # Skip blank lines and the PSL header block.
            if not line.strip() or line.startswith(header_prefixes):
                continue
            fields = line.strip().split('\t')
            if int(fields[10]) < min_q_size:
                continue
            match = int(fields[0])
            if match < min_match:
                continue
            q_name = fields[9]
            alignment = {
                'TName': fields[13],
                'strand': fields[8],
                'TBlocks': _split_blocks(fields[18]),
                'TStarts': _split_blocks(fields[20]),
            }
            entry = best.get(q_name)
            if entry is None or match > entry[0]:
                # New best score: drop every previously stored alignment,
                # including earlier ties at the old (lower) score.
                best[q_name] = (match, [alignment])
            elif match == entry[0]:
                entry[1].append(alignment)
    return best


def _write_gtf(gtf_path, best, no_multi):
    """Write one GTF exon line per aligned block of every kept alignment."""
    with open(gtf_path, 'w') as outfile:
        for q_name in sorted(best):
            alignments = best[q_name][1]
            if no_multi and len(alignments) > 1:
                continue
            for k, alignment in enumerate(alignments):
                transcript_id = q_name + '.alignment-' + str(k)
                for t_start, t_block in zip(alignment['TStarts'],
                                            alignment['TBlocks']):
                    exon_start = int(t_start)
                    exon_end = exon_start + int(t_block)
                    # The start is written as tStart + 1; the end is
                    # tStart + blockSize.
                    outfile.write(
                        alignment['TName'] + '\tPSL\texon\t'
                        + str(exon_start + 1) + '\t' + str(exon_end)
                        + '\t1000\t' + alignment['strand']
                        + '\t.\ttranscript_id "' + transcript_id + '";\n')


def run():
    """Convert the best PSL alignment(s) of each query into GTF exon records.

    Command line (read from sys.argv):
        PSL outfile [-minQSize bp] [-nomulti] [-minMatch bp]

    -minQSize bp  skip alignments whose query size (column 11) is below bp
    -minMatch bp  skip alignments whose match count (column 1) is below bp
    -nomulti      skip queries that have more than one top-scoring alignment

    For every query only the alignment(s) with the highest match count are
    kept.  Each kept alignment is written as a transcript named
    '<query>.alignment-<k>', with one 'exon' line per aligned block.
    Prints the usage message and exits with status 1 when the two positional
    arguments are missing.
    """
    # Both the PSL path and the output path are required.
    if len(sys.argv) < 3:
        print('usage: python %s PSL outfile [-minQSize bp] [-nomulti] [-minMatch bp]' % sys.argv[0])
        sys.exit(1)

    psl_path = sys.argv[1]
    gtf_path = sys.argv[2]

    min_q_size = _int_option(sys.argv, '-minQSize')
    min_match = _int_option(sys.argv, '-minMatch')
    no_multi = '-nomulti' in sys.argv

    # Parse the input completely before the output file is created.
    best = _read_best_alignments(psl_path, min_q_size, min_match)
    _write_gtf(gtf_path, best, no_multi)
   
# Script entry point: the conversion runs as soon as this file is executed
# (there is no __main__ guard, so importing the module triggers it as well).
run()
