##################################
#                                #
# Last modified 2025/07/25       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
import os
import gzip
import subprocess
import regex

def _trim_read(sequence, mM3, mM5):
    """Return the adapter-trimmed form of one read sequence.

    sequence -- read sequence with surrounding whitespace already removed
    mM3      -- compiled pattern for the 3' adapter
    mM5      -- compiled pattern for the 5' adapter, or None when no
                5' trimming was requested

    3' step: the read is cut just before the last occurrence of the last
    adapter hit.  An empty result (no hit, or the hit sits at position 0)
    leaves the working sequence untouched.

    5' step (only when mM5 is given): everything up to and including the
    first occurrence of the first adapter hit is removed from the working
    sequence; without a hit the working sequence is returned as is.

    NOTE(review): without a 5' adapter the value returned is the raw result
    of the 3' step, i.e. an empty string when the 3' adapter was not found.
    This mirrors the historical output -- confirm it is intended.
    NOTE(review): the usage text describes "first occurrence" for 3' and
    "last occurrence" for 5', the opposite of what is done here -- confirm
    which one is intended.
    """
    hits3 = mM3.findall(sequence)
    if hits3:
        newsequence = sequence.rpartition(hits3[-1])[0]
    else:
        newsequence = ''
    if newsequence != '':
        sequence = newsequence

    if mM5 is not None:
        hits5 = mM5.findall(sequence)
        if hits5:
            newsequence = sequence.partition(hits5[0])[-1]
        else:
            newsequence = sequence
    return newsequence


def run():
    """Trim adapters from a FASTQ stream and write FASTA to standard output.

    Command line (read from sys.argv):
        fastq adapter_sequence [-MM number] [-5padapter sequence] [-splitReadID]

    fastq may be '-' (standard input), a name ending in '.gz' (opened with
    gzip) or a plain file name.  For every record a '>' header line and the
    trimmed sequence are printed; the '+' and quality lines are dropped.

    Exits with status 1 after printing the usage text when the two
    positional arguments are missing, and with status 1 when a record does
    not start with '@'.
    """
    # print(...) is always given exactly one argument, so it produces the
    # same output whether it is parsed as the Python 2 print statement or
    # called as a function.

    # Both positional arguments are read unconditionally below, so both
    # must be present.
    if len(sys.argv) < 3:
        print('usage: python %s fastq adapter_sequence [-MM number] [-5padapter sequence] [-splitReadID]' % sys.argv[0])
        print('\tUse - instead of a fastq name if you want to capture standard input')
        print('\tthe script can read gzipped files directly')
        print('\tthe script will print to standard output by default')
        print('\tfor 3p end trimming, the script will find the first occurence of the first X bp of the adapter and trim the read there')
        print('\tfor 5p end trimming, the script will find the last occurence of the adapter sequene and trim the read there')
        print('\tUse the -MM option if you want to allow mismatches in either adapter')
        sys.exit(1)

    fastq = sys.argv[1]
    adapter3 = sys.argv[2]

    do5p = '-5padapter' in sys.argv
    if do5p:
        adapter5 = sys.argv[sys.argv.index('-5padapter') + 1]

    doSplitReadId = '-splitReadID' in sys.argv

    MM = 0
    if '-MM' in sys.argv:
        MM = int(sys.argv[sys.argv.index('-MM') + 1])

    # The patterns are needed whether or not -MM was given (MM == 0 means
    # exact matching).  A fuzziness constraint applies to the preceding
    # pattern item only, so the adapter is wrapped in a non-capturing group
    # to make the error budget cover the whole adapter.
    fuzziness = '{e<=' + str(MM) + '}'
    mM3 = regex.compile('(?:' + adapter3 + ')' + fuzziness)
    mM5 = None
    if do5p:
        mM5 = regex.compile('(?:' + adapter5 + ')' + fuzziness)

    if fastq == '-':
        lineslist = sys.stdin
    elif fastq.endswith('.gz'):
        lineslist = gzip.open(fastq)
    else:
        lineslist = open(fastq)

    try:
        # i is the position inside the current four-line FASTQ record:
        # 1 = '@' header, 2 = sequence, 3 = '+' separator, 4 = qualities.
        i = 1
        for line in lineslist:
            if i == 1:
                if line[0] != '@':
                    print('fastq file broken, exiting')
                    sys.exit(1)
                readID = line.strip()
                if doSplitReadId:
                    # keep only the part of the ID before the first space
                    readID = readID.split(' ')[0]
                print('>' + readID[1:])
                i = 2
            elif i == 2:
                print(_trim_read(line.strip(), mM3, mM5))
                i = 3
            elif i == 3:
                # Lines are skipped until one starting with '+' is seen.
                if line[0] == '+':
                    i = 4
            else:
                # quality line: nothing to emit, next line starts a record
                i = 1
    finally:
        # Close files this function opened; standard input is left alone.
        if lineslist is not sys.stdin:
            lineslist.close()

# Script entry point.  The call is not guarded by a __name__ check, so it
# executes whenever this file is run or imported.
run()

