##################################
#                                #
# Last modified 2019/04/03       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import os
import string
import Levenshtein

def getReverseComplement(preliminarysequence):
    
    DNA = {'A':'T',
           'T':'A',
           'G':'C',
           'C':'G',
           'N':'N',
           'X':'N',
           'a':'t',
           't':'a',
           'g':'c',
           'c':'g',
           'n':'N',
           'x':'N',
           'R':'N',
           'r':'N',
           'M':'N',
           'm':'N',
           'Y':'N',
           'y':'N',
           'S':'N',
           's':'N',
           'K':'N',
           'k':'N',
           'W':'N',
           'w':'N',
           'H':'N',
           'h':'N',
           'V':'N',
           'v':'N',
           'B':'N',
           'b':'N',
           'D':'N',
           'd':'N'
            }
    sequence=''
    for j in range(len(preliminarysequence)):
        sequence=sequence+DNA[preliminarysequence[len(preliminarysequence)-j-1]]
    return sequence

def run():

    if len(sys.argv) < 5:
        print 'usage: python %s fasta SL maximum_mismatches minLen outfilename [-doInternal subsequence]'% sys.argv[0]
        print '\tUse - to specify standard input'
        print '\tthe script can read .gz and .bz2 files'
#        print '\tthe trim_bp parameter refers to how many bases should be retained after the SL sequence'
        print '\tthe minLen parameter refers to the minimal length of reads after trimming'
        sys.exit(1)

    fasta = sys.argv[1]
    SL = sys.argv[2].upper()
    MM = int(sys.argv[3])
    minL = int(sys.argv[4])
    outfilename = sys.argv[5]

    doInternal = False
    if '-doInternal' in sys.argv:
        doInternal = True
        SLSubSeq = sys.argv[sys.argv.index('-doInternal') + 1]
        revSLSubSeq = getReverseComplement(SLSubSeq)
        print 'will look for SL matches internally, using the following subsequence', SLSubSeq

    SeqDict = {}
    sequence=''
    inputdatafile = open(fasta)
    for line in inputdatafile:
        if line[0]=='>':
            if sequence != '':
                SeqDict[chr] = ''.join(sequence)
            chr = line.strip().split('>')[1]
            sequence=[]
            Keep=False
            continue
        else:
            sequence.append(line.strip())
    SeqDict[chr] = ''.join(sequence)

    print 'finished inputting sequence'

    revSL = getReverseComplement(SL)

    outfile = open(outfilename,'w')
    outline = '#transcript\tleft\tright\tmismatches'
    outfile.write(outline + '\n')

    T = 0
    for ID in SeqDict.keys():
        T += 1
        if T % 100 == 0:
            print 'processed', T, 'transcript'
        TSeq = SeqDict[ID] 
        revTSeq = getReverseComplement(TSeq)
        for i in range(len(SL) - minL):
            SLseqlet = SL[i:]
            LMM = Levenshtein.distance(TSeq[0:len(SL)-i],SLseqlet)
            if LMM <= MM:
                outline = ID + '\t' + '5p'  + '\t' + '0' + '\t' + str(len(SLseqlet)) + '\t' + str(LMM)
                outfile.write(outline + '\n')
                break
        for i in range(len(revSL) - minL):
            SLseqlet = revSL[i:]
            LMM = Levenshtein.distance(TSeq[0:len(SL)-i],SLseqlet)
            if LMM <= MM:
                outline = ID + '\t' + '3p'  + '\t' + str(len(TSeq) - i) + '\t' + str(len(TSeq)) + '\t' + str(LMM)
                outfile.write(outline + '\n')
                break
        if doInternal:
            for i in range(len(SL) - minL,len(TSeq) - len(SL) - minL):
                seqlet = TSeq[i:i+len(SLSubSeq)]
                if Levenshtein.distance(SLSubSeq,seqlet) <= MM:
                    outline = ID + '\t' + 'internal_for'  + '\t' + str(i) + '\t' + str(i + len(SLSubSeq)) + '\t' + str(Levenshtein.distance(SLSubSeq,seqlet))
                    outfile.write(outline + '\n')
                elif Levenshtein.distance(revSLSubSeq,seqlet) <= MM:
                    outline = ID + '\t' + 'internal_rev'  + '\t' + str(i) + '\t' + str(i + len(SLSubSeq)) + '\t' + str(Levenshtein.distance(revSLSubSeq,seqlet))
                    outfile.write(outline + '\n')
                else:
                    pass

    outfile.close()

run()

