##################################
#                                #
# Last modified 2018/08/14       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import os

# Best-effort acceleration: if a module named psyco can be imported, enable it
# for the whole program via psyco.full(). Any failure (module missing or the
# call raising) is deliberately ignored and the script runs without it.
try:
	import psyco
	psyco.full()
except:
	pass

def _read_barcode_config(config, outprefix, outputs):
    """Open one FASTQ output per barcode in *config*; return the barcode length.

    Each non-blank line of *config* is tab-separated: <label> <barcode>.
    For every line, '<outprefix>.<label>.fastq' is opened for writing and
    stored in *outputs* under the barcode. Handles are added to the caller's
    dict (rather than returned) so the caller can close them even if this
    function fails part-way through.

    The returned length is that of the last barcode in the file. A warning is
    written to stderr when the barcodes differ in length, because only
    barcodes of the returned length can ever match a read prefix.

    Exits with status 1 if the config contains no barcodes.
    """
    lengths = set()
    barcode = None
    with open(config) as config_file:
        for line in config_file:
            fields = line.strip().split('\t')
            if fields == ['']:
                # Blank line: nothing to register.
                continue
            label = fields[0]
            barcode = fields[1]
            outputs[barcode] = open(outprefix + '.' + label + '.fastq', 'w')
            lengths.add(len(barcode))

    if barcode is None:
        sys.stderr.write('no barcodes found in %s\n' % config)
        sys.exit(1)

    if len(lengths) > 1:
        sys.stderr.write(
            'warning: barcodes in %s differ in length; only the first %d '
            'bases of each read barcode are compared\n' % (config, len(barcode))
        )
    return len(barcode)


def _open_input(inputfilename):
    """Return (stream, process) for reading the tab-delimited input.

    '-' selects standard input. Names ending in '.bz2' or '.gz' are
    decompressed by an external 'bzip2 -cd' or 'gunzip -c' process; any other
    name is opened directly. *process* is the decompressor's Popen object, or
    None when no child process was started.
    """
    import subprocess

    if inputfilename == '-':
        return sys.stdin, None
    if inputfilename.endswith('.bz2'):
        cmd = ['bzip2', '-cd', inputfilename]
    elif inputfilename.endswith('.gz'):
        cmd = ['gunzip', '-c', inputfilename]
    else:
        return open(inputfilename), None

    # Argument list, no shell: file names with spaces or shell
    # metacharacters are passed through verbatim.
    process = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                               universal_newlines=True)
    return process.stdout, process


def run():
    """Split tab-delimited reads into per-barcode FASTQ files.

    Command line: <inputfilename> <config> <outprefix>

    Every input line is tab-separated: ID, sequence, quality scores, barcode
    read. The first N characters of the barcode read (N = barcode length taken
    from the config) are looked up among the configured barcodes; on a match
    the read is appended as a four-line FASTQ record to
    '<outprefix>.<label>.fastq'. Reads without a matching barcode are dropped.

    Exits with status 1 after printing the usage text when arguments are
    missing, or after printing the offending fields when a line has no
    barcode column.
    """
    # Three positional arguments are read below, so argv needs four entries.
    if len(sys.argv) < 4:
        print('usage: python %s <inputfilename> config outprefix' % sys.argv[0])
        print('\tConfig_format: <label> <barcode>')
        print('\tCthe script assumes it is the first N bases (depending on the length) of barcodes that are the barcode')
        print('\tCthe script assumes the input is the output of PEFastqToTabDelimited.py')
        sys.exit(1)

    inputfilename = sys.argv[1]
    config = sys.argv[2]
    outprefix = sys.argv[3]

    BCDict = {}
    try:
        N = _read_barcode_config(config, outprefix, BCDict)
        stream, process = _open_input(inputfilename)
        try:
            k = 0
            written = 0
            # readline() rather than file iteration, so piped/stdin input is
            # consumed line by line as it arrives.
            for line in iter(stream.readline, ''):
                k += 1
                if k % 4000000 == 0:
                    # NOTE(review): k is incremented once per input line, yet
                    # k/4 is what gets reported as "lines processed". Kept
                    # as-is to preserve the output; confirm the intent.
                    print('%d lines processed %d demultiplexed reads printed out'
                          % (k // 4, written))
                fields = line.strip().split('\t')
                ID = fields[0]
                seq = fields[1]
                scores = fields[2]
                try:
                    BC = fields[3][0:N]
                except IndexError:
                    # No barcode column: show the malformed record and stop.
                    print(fields)
                    sys.exit(1)
                if BC in BCDict:
                    BCDict[BC].write(
                        '@' + ID + '\n' + seq + '\n' + '+\n' + scores + '\n'
                    )
                    written += 1
        finally:
            if process is not None:
                stream.close()
                process.wait()
            elif stream is not sys.stdin:
                stream.close()
    finally:
        for handle in BCDict.values():
            handle.close()

# Script entry point. There is no __main__ guard, so run() is also executed
# (and reads sys.argv) if this file is imported as a module.
run()

