##################################
#                                #
# Last modified 2021/10/11       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import os
import pysam
import random
import string
from sets import Set

def getReverseComplement(preliminarysequence):
    
    DNA = {'A':'T','T':'A','G':'C','C':'G','N':'N','a':'t','t':'a','g':'c','c':'g','n':'n'}
    sequence=''
    for j in range(len(preliminarysequence)):
        sequence=sequence+DNA[preliminarysequence[len(preliminarysequence)-j-1]]
    return sequence

def run():

    if len(sys.argv) < 7:
        print 'usage: python %s BAM chrom.sizes barcodes columnID 1|2|3 rev|for outfile [-fullBC]' % sys.argv[0]
        print '\tNote: the position of the barcodes refers to the ligation rounds'
        print '\t\t so if 1 is indicated, but [rev] is listed in the last argument, the script will look at the third sequenced barcode in a reverse complement direction'
        print '\tIt is assumed the read ID look like this: '
        print '\t\tNB551514:646:HM3JHBGXJ:3:13412:21229:17368:::[TAGAACAC+TCAGATTC+TTCGCACC]'
        print '\tuse the [-fullBC] option if you want full barcodes and not just one of the rounds; this option is only compatible with the [for] argument'
        sys.exit(1)

    BAM = sys.argv[1]
    chrominfo=sys.argv[2]
    chromInfoList=[]
    linelist=open(chrominfo)
    for line in linelist:
        fields = line.strip().split('\t')
        chr = fields[0]
        start = 0
        end = int(fields[1])
        chromInfoList.append((chr,start,end))
    BCfile = sys.argv[3]
    BCfileID = int(sys.argv[4])
    BCpos = int(sys.argv[5])
    ForRev = sys.argv[6]
    outfilename = sys.argv[7]

    doFullBC = False
    if '-fullBC' in sys.argv:
        doFullBC = True
        print 'will use full barcodes'

    i=0
    BCDict={}

    linelist=open(BCfile)
    for line1 in linelist:
        fields = line1.strip().split('\t')
        if ForRev == 'rev':
            BC = getReverseComplement(fields[BCfileID])
        if ForRev == 'for':
            BC = fields[BCfileID]
        BCDict[BC] = 1

    samfile = pysam.Samfile(BAM, "rb" )
    outfile = pysam.Samfile(outfilename, "wb", template=samfile)

    WR = 0

    for (chr,start,end) in chromInfoList:
        try:
            for alignedread in samfile.fetch(chr, start, end):
                AAAA = alignedread
                break
        except:
            print 'not found in BAM file', chr, start, end
            continue
        for alignedread in samfile.fetch(chr, start, end):
            i+=1
            if i % 1000000 == 0:
                print 'processed', str(i/1000000) + 'M alignments', chr,start,alignedread.pos,end, 'written', WR, ' alignments'
            fields = str(alignedread).split('\t')
            if doFullBC:
                BC = fields[0].split(':::')[1].replace('[','').replace(']','')
            else:
                BC = fields[0].split(':::')[1].replace('[','').replace(']','').split('+')
            if ForRev == 'for':
                if doFullBC:
                    if BCDict.has_key(BC):
                        outfile.write(alignedread)
                        WR+=1
                else:
                    if BCDict.has_key(BC[BCpos-1]):
                        outfile.write(alignedread)
                        WR+=1
            if ForRev == 'rev':
                if BCDict.has_key(BC[-BCpos]):
                    outfile.write(alignedread)
                    WR+=1


run()
