##################################
#                                #
# Last modified 2019/03/23       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import os
import string
import Levenshtein

def run():

    if len(sys.argv) < 5:
        print 'usage: python %s inputfilename SL maximum_mismatches trim_bp minLen'% sys.argv[0]
        print '\tUse - to specify standard input; the script will print to standard output by default'
        print '\tthe script can read .gz and .bz2 files'
        print '\tthe trim_bp parameter refers to how many bases should be retained after the SL sequence'
        print '\tthe minLen parameter refers to the minimal length of reads after trimming'
        sys.exit(1)

    inputfilename = sys.argv[1]
    SL = sys.argv[2].upper()
    MM = int(sys.argv[3])
    trim = int(sys.argv[4])
    minL = int(sys.argv[5])

    doStdInput = False
    if inputfilename == '-':
        doStdInput = True

    doStdIn = False
    if inputfilename != '-':
        if inputfilename.endswith('.bz2'):
            cmd = 'bzip2 -cd ' + inputfilename
        elif inputfilename.endswith('.gz'):
            cmd = 'gunzip -c ' + inputfilename
        else:
            cmd = 'cat ' + inputfilename
        p = os.popen(cmd, "r")
    else:
        doStdIn = True

    line = 'line'
    i=1
    while line != '':
        if doStdIn:
            line = sys.stdin.readline()
        else:
            line = p.readline()
        if line == '':
            break
        if i==1 and line[0]=='@':
            ID=line.strip().replace(' ','_')
            i=2
            continue
        if i==2:
            i=3
            sequence = line.strip().upper()
            continue
        if i==3 and line[0]=='+':
            plus='+\n'
            i=4
            continue
        if i==4:
            i=1
            scores = line.strip()
            hasSL = False
            if SL in sequence:
                SLend = len(sequence.split(SL)[0]) + len(SL)
                hasSL = True
                newseq = sequence[SLend:]
                newscores = sequence[SLend:]
            else:
                for k in range(len(sequence) - len(SL) - minL):
                    seqlet = sequence[k:k+len(SL)]
                    LDist = Levenshtein.distance(seqlet,SL)
                    if LDist <= MM:
                        SLend = k + len(SL)
                        hasSL = True
                        newseq = sequence[SLend:]
                        newscores = sequence[SLend:]
            if hasSL:
                if len(newseq) >= minL:
                    pass
                else:
                    continue
                ID = ID + '__SLpresent'
                print ID
                if len(newseq) > trim:
                    print newseq[0:trim]
                    print '+'
                    print newscores[0:trim]
                else:
                    print newseq
                    print '+'
                    print newscores
            else:
                if len(sequence) >= minL:
                    pass
                else:
                    continue
                print ID
                if len(sequence) > trim:
                    print sequence[0:trim]
                    print '+'
                    print scores[0:trim]
                else:
                    print sequence
                    print '+'
                    print scores

run()

