##################################
#                                #
# Last modified 2022/03/24       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
import os
import gzip
from sets import Set
import Levenshtein
import numpy as np

def getReverseComplement(preliminarysequence):
    
    DNA = {'-':'-', '+':'+', 'A':'T','T':'A','G':'C','C':'G','N':'N','X':'X','a':'t','t':'a','g':'c','c':'g','n':'n','x':'x','R':'R','r':'r','M':'M','m':'m','Y':'Y','y':'y','S':'S','s':'s','K':'K','k':'k','W':'W','w':'w'}
    sequence=''
    for j in range(len(preliminarysequence)):
        sequence=sequence+DNA[preliminarysequence[len(preliminarysequence)-j-1]]
    return sequence

def run():

    if len(sys.argv) < 3:
        print 'usage: python %s FASTQ.gz BCpositions(comma-separated) barcodes fieldID [-revcomp]' % sys.argv[0]
        print '\tNote: the script currently assumes :::[BC1+BC2+BC3+...+BCN] format'
        sys.exit(1)

    doRC = False
    if '-revcomp' in sys.argv:
        doRC = True

    FASTQ = sys.argv[1]
    barcodes = sys.argv[3]
    fieldID = int(sys.argv[4])

    BCpositions = []
    for pos in sys.argv[2].split(','):
        BCpositions.append(int(pos))
#    BCpositions.sort()

    BCDict={}
    lineslist = open(barcodes)
    for line in lineslist:
        if line.startswith('#'):
            continue
        fields = line.strip().split('\t')
        BC = fields[fieldID]
        if doRC:
            BC = getReverseComplement(BC)
        BCDict[BC] = 1

    RL = 0
    lineslist = gzip.open(FASTQ)
    for line in lineslist:
        if RL % 4 == 0:
            readID = line.strip()
            RL += 1
            continue
        elif RL % 4 == 1:
            sequence = line.strip()
            RL += 1
            continue
        elif RL % 4 == 2:
            RL += 1
            continue
        elif RL % 4 == 3:
            RL += 1
            QC = line.strip()
            pass

        BCfields = readID.split(':::')[1].split(' ')[0].replace('[','').replace(']','').split('+')
        BC = ''
        for pos in BCpositions:
            BC += BCfields[pos] + '+'
        BC = BC[:-1]

        if BCDict.has_key(BC):
            pass
        else:
            continue

        print readID
        print sequence
        print '+'        
        print QC

run()
