##################################
#                                #
# Last modified 2019/02/11       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import os
from sets import Set
import random
import Levenshtein

def run():

    if len(sys.argv) < 6:
        print 'usage: python %s end1 end2 adapter_sequence maxMismatches minLength maxLength [-renameReads prefix]' % sys.argv[0]
        print '\tThe script can read compressed files as long as they have the correct suffix - .zip, .bz2 or .gz'
        print '\tYou can have multiple files in each end1 or end2 argument, they have to be comma-separated'
        print '\tit will print to standard output'
        sys.exit(1)

    fastq1files = sys.argv[1].split(',')
    fastq2files = sys.argv[2].split(',')

    if len(fastq1files) != len(fastq1files):
        print 'incorrect number of input files'
        sys.exit(1)

    adapter = sys.argv[3]

    maxMM = int(sys.argv[4])

    minRL = int(sys.argv[5])
    maxRL = int(sys.argv[6])
    
    doRename = False
    if '-renameReads' in sys.argv:
        doRename = True
        prefix = sys.argv[sys.argv.index('-renameReads') + 1]

    i=0
    for f in range(len(fastq1files)):
        fastq1 = fastq1files[f]
        fastq2 = fastq2files[f]
        if fastq1.endswith('.bz2'):
            cmd1 = 'bzip2 -cd ' + fastq1
        elif fastq1.endswith('.gz'):
            cmd1 = 'zcat ' + fastq1
        elif fastq1.endswith('.zip'):
            cmd1 = 'unzip -p ' + fastq1
        else:
            cmd1 = 'cat ' + fastq1
        p1 = os.popen(cmd1, "r")

        if fastq2.endswith('.bz2'):
            cmd2 = 'bzip2 -cd ' + fastq2
        elif fastq2.endswith('.gz'):
            cmd2 = 'gunzip -c ' + fastq2
        elif fastq2.endswith('.zip'):
            cmd1 = 'unzip -p ' + fastq2
        else:
            cmd2 = 'cat ' + fastq2
        p2 = os.popen(cmd2, "r")

        line = 'line'
        lines = []
        while line != '':
            line = p1.readline().strip()
            lines.append(line)
            if line == '':
                break
            i+=1
            if i % 4 == 0:
               ID1 = lines[0]
               seq1 = lines[1]
               q1 = lines[3]
               ID2 = p2.readline().strip()
               seq2 = p2.readline().strip()
               ID22 = p2.readline().strip()
               q2 = p2.readline().strip()
               Pass1 = False
               Pass2 = False
#               print adapter
#               print seq1, seq1.split(adapter)
#               print seq2, seq2.split(adapter)
               if adapter in seq1:
                   seq1 = seq1.split(adapter)[0]
                   if seq1 >= minRL and seq1 < maxRL:
                       Pass1 = True
               else:
                   for j in range(minRL,maxRL):
                       SS = seq1[j:j+len(adapter)]
                       LDist = Levenshtein.distance(adapter,SS)
                       if LDist <= maxMM:
                           Pass1 = True
                           seq1 = seq1[0:j]
                           break
               if adapter in seq2:
                   seq2 = seq2.split(adapter)[0]
                   if seq2 >= minRL and seq2 < maxRL:
                       Pass1 = True
               else:
                   for j in range(minRL,maxRL):
                       SS = seq2[j:j+len(adapter)]
                       LDist = Levenshtein.distance(adapter,SS)
                       if LDist <= maxMM:
                           Pass2 = True
                           seq2 = seq2[0:j]
                           break
               if Pass1 and Pass2:
                   q1 = q1[0:len(seq1)]
                   q2 = q2[0:len(seq2)]
                   if doRename:
                       outline = prefix + str(i/4) + '\t' + seq1 + '\t' + q1 + '\t' + seq2 + '\t' + q2
                   else:
                       outline = ID1[1:len(ID1)].split('/')[0] + '\t' + seq1 + '\t' + q1 + '\t' + seq2 + '\t' + q2
                   print outline
               lines = []

run()