##################################
#                                #
# Last modified 2023/08/30       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import gzip
import string
from sets import Set

def getReverseComplement(preliminarysequence):
    """Return the reverse complement of a DNA sequence.

    The input is read from its last character to its first and every
    character is replaced by its complement. Case is preserved and
    'N'/'n' map to themselves.

    Args:
        preliminarysequence: string made only of the characters
            A, C, G, T, N (upper or lower case).

    Returns:
        The reverse-complemented sequence as a new string; an empty
        input yields an empty string.

    Raises:
        KeyError: if the sequence contains a character that has no
            entry in the complement table.
    """

    DNA = {'A':'T','T':'A','G':'C','C':'G','N':'N','a':'t','t':'a','g':'c','c':'g','n':'n'}
    # A single join over the reversed sequence is linear in its length,
    # unlike growing the result one character at a time.
    return ''.join(DNA[base] for base in reversed(preliminarysequence))

def run():
    """Merge several barcode/sgRNA count tables into one summed table.

    Command line: ``python <script> fileslist outfilename``

    ``fileslist`` is a text file whose first tab-separated column holds
    the path of one input table per line; lines starting with '#' and
    blank lines are skipped. Paths ending in '.gz' are opened with gzip.

    Every input table is tab-separated with the columns barcode, sgRNA,
    label, UMIs, reads; lines starting with '#' and blank lines are
    skipped. A UMIs or reads value of 'nan' counts as 0.

    The UMI and read counts are summed per (barcode, sgRNA, label)
    across all input tables and written to ``outfilename`` as a
    tab-separated table with a '#'-prefixed header line, with barcodes
    in sorted order.

    Exits with status 1 after printing a usage message when fewer than
    two command-line arguments are given.
    """

    # Both positional arguments are required: sys.argv[2] is read below.
    if len(sys.argv) < 3:
        print('usage: python %s fileslist outfilename' % sys.argv[0])
        sys.exit(1)

    filelist = sys.argv[1]
    outfilename = sys.argv[2]

    # BCDict[barcode][(sgRNA, label)] -> {'UMI': total, 'reads': total}
    BCDict = {}

    with open(filelist) as lineslist:
        for line in lineslist:
            if line.startswith('#'):
                continue
            path = line.strip().split('\t')[0]
            if not path:
                # Blank line in the file list: nothing to open.
                continue
            print(path)
            if path.endswith('.gz'):
                lines = gzip.open(path)
            else:
                lines = open(path)
            try:
                for LL in lines:
                    if LL.startswith('#'):
                        continue
                    stripped = LL.strip()
                    if not stripped:
                        # Tolerate empty lines (e.g. a trailing newline).
                        continue
                    fields = stripped.split('\t')
                    BC = fields[0]
                    sgRNA = fields[1]
                    label = fields[2]
                    # 'nan' marks a missing count and contributes nothing.
                    UMIs = 0 if fields[3] == 'nan' else int(fields[3])
                    reads = 0 if fields[4] == 'nan' else int(fields[4])
                    perBC = BCDict.setdefault(BC, {})
                    counts = perBC.setdefault((sgRNA, label), {'UMI': 0, 'reads': 0})
                    counts['UMI'] += UMIs
                    counts['reads'] += reads
            finally:
                # Close each input table even if a line fails to parse.
                lines.close()

    with open(outfilename, 'w') as outfile:
        outfile.write('#barcode\tsgRNA\tlabel\tUMIs\treads' + '\n')
        for BC in sorted(BCDict):
            for (sgRNA, label), counts in BCDict[BC].items():
                outfields = [BC, sgRNA, label, str(counts['UMI']), str(counts['reads'])]
                outfile.write('\t'.join(outfields) + '\n')
            
# Execute the merge only when this file is run as a script, so that
# importing the module does not trigger argument parsing and file I/O.
if __name__ == '__main__':
    run()
