##################################
#                                #
# Last modified 2022/07/05       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
import os
import gzip
from sets import Set
import Levenshtein
import numpy as np
import regex

def run():

    if len(sys.argv) < 3:
        print 'usage: python %s FASTQ.gz UMIlen pos [-separator string mismatches] [-minLen bp]' % sys.argv[0]
        sys.exit(1)


    doSep = False
    if '-separator' in sys.argv:
        separator = sys.argv[sys.argv.index('-separator')+1]
        MM = int(sys.argv[sys.argv.index('-separator')+2])
        doSep = True
        mM3 = regex.compile(separator + '{e<=' + str(MM) + '}')

    minLen = 0
    if '-minLen' in sys.argv:
        minLen = int(sys.argv[sys.argv.index('-minLen')+1])

    FASTQ = sys.argv[1]
    UMIlen = int(sys.argv[2])
    pos = int(sys.argv[3])

    RL = 0
    lineslist = gzip.open(FASTQ)
    for line in lineslist:
        if RL % 4 == 0:
            readID = line.strip()
            RL += 1
            continue
        elif RL % 4 == 1:
            sequence = line.strip()
            RL += 1
            continue
        elif RL % 4 == 2:
            RL += 1
            continue
        elif RL % 4 == 3:
            RL += 1
            QC = line.strip()
            pass

        if doSep:
            if len(mM3.findall(sequence)) > 0:
                pass
            else:
                continue

#            print mM3.findall(sequence)
#            print sequence
#            print sequence.partition(mM3.findall(sequence)[0])
#            print sequence.rpartition(mM3.findall(sequence)[0])[0]

            (constSeq, sepSeq, restSeq) = sequence.partition(mM3.findall(sequence)[0])

            newPos = len(constSeq) + len(sepSeq)

            UMI = sequence[newPos:newPos+UMIlen]

            if len(sequence[newPos+UMIlen:]) > minLen:
                pass
            else:
                continue

            print readID.split(' ')[0][:-1] + '+' + UMI + ']'
            print sequence[newPos+UMIlen:]
            print '+'        
            print QC[newPos+UMIlen:]
        else:
            UMI = sequence[pos:pos+UMIlen]

            if len(sequence[pos+UMIlen:]) > minLen:
                pass
            else:
                continue

            print readID.split(' ')[0][:-1] + '+' + UMI + ']'
            print sequence[pos+UMIlen:]
            print '+'        
            print QC[pos+UMIlen:]

run()
