##################################
#                                #
# Last modified 2025/03/10       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
import os
from sets import Set
import Levenshtein
# import numpy as np

def getReverseComplement(preliminarysequence):
    
    DNA = {'A':'T','T':'A','G':'C','C':'G','N':'N','X':'X','a':'t','t':'a','g':'c','c':'g','n':'n','x':'x','R':'R','r':'r','M':'M','m':'m','Y':'Y','y':'y','S':'S','s':'s','K':'K','k':'k','W':'W','w':'w'}
    sequence=''
    for j in range(len(preliminarysequence)):
        sequence=sequence+DNA[preliminarysequence[len(preliminarysequence)-j-1]]
    return sequence

def run():

    if len(sys.argv) < 4:
        print 'usage: python %s R1|R2 starting_spacer_len fixed_position_sequences(pos1:seq1,pos2:seq2) UMIpos:len [-BCedit N] [-revcompBC] ' % sys.argv[0]
        print '\t stream the output of PEFastqToTabDelimited.py, then capture the output of this script with PEFastqToTabDelimited-reverse.py'
        print '\t if the barcode sequence are reverse complemented in the read IDs, use the [-revcompBC] option'
        print '\t the default [-BCedit] edit distance value is 1'
        sys.exit(1)

    R1R2 = sys.argv[1]
    spacer = int(sys.argv[2])
    FixedPositions = []
    fixedposseq = []
    for posseq in sys.argv[3].split(','):
        pos = int(posseq.split(':')[0])
        seq = posseq.split(':')[1]
        FixedPositions.append((pos,seq))
        fixedposseq.append(seq)
    fixedposseq = ''.join(fixedposseq)
    UMIpos = int(sys.argv[4].split(':')[0])
    UMIlen = int(sys.argv[4].split(':')[1])

    BCedit = 1
    if '-BCedit' in sys.argv:
        BCedit = int(sys.argv[sys.argv.index('-BCedit') + 1])

    lineslist = sys.stdin
    for line in lineslist:
        fields = line.strip().split('\t')
        if R1R2 == 'R1':
            BCseq = fields[1]
        if R1R2 == 'R2':
            BCseq = fields[3]
        FPosSeq = {}
        for i in range(spacer + 1):
            FPosSeq[i] = []
        for (pos,seq) in FixedPositions:
            for i in range(spacer + 1):
                FPosSeq[i].append(BCseq[pos+i:pos+i+len(seq)])
        SP = ''
        for i in range(spacer + 1):
            FPosSeq[i] = ''.join(FPosSeq[i])
            if FPosSeq[i] == fixedposseq:
                SP = i
                break
        if SP == '':
            for i in range(spacer + 1):
                LDist = Levenshtein.distance(FPosSeq[i],fixedposseq)
                if LDist <= BCedit:
                    SP = i
                    break
        if SP == '':
            BC = 'nan+BC+BC'
        else:
            BCseq = BCseq[i:]
            BC = ''
            curPos = 0
            for (pos,seq) in FixedPositions:
                BC += BCseq[curPos:pos]
                curPos = pos + len(seq)
            BC += BCseq[curPos:UMIpos]

            BC += '+BC+BC'

        if UMIlen > 0:
            UMI = BCseq[UMIpos:UMIpos+UMIlen]
            BC += '+' + UMI

        newID = fields[0].split(' ')[0] + ':::' + '[' + BC + ']'
        print newID + '\t' + fields[1] + '\t' + fields[2] + '\t' + fields[3] + '\t' + fields[4]
            
run()
