##################################
#                                #
# Last modified 2021/02/16       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
import math
import numpy
import gzip
from sets import Set

def run():
    """Build a histogram of index/barcode strings and write it to a file.

    Command-line arguments (read from ``sys.argv``):
        1. datafilename -- tab-delimited input; ``-`` reads standard input,
           a name ending in ``.gz`` is opened with ``gzip``.
        2. fieldID      -- 1-based position of the wanted element in the
           ``+``-separated string that follows ``1:N:0:`` in the first
           tab-delimited column of each line.
        3. outfilename  -- output path; one ``<string>\\t<count>`` line per
           distinct string, ordered by descending count (ties are broken by
           descending string order).

    Lines starting with ``#`` are skipped.  A progress message is printed
    every 1,000,000 input lines (skipped lines included in the tally).

    Exits with status 1 after printing usage if fewer than three arguments
    are supplied.  Raises ``ValueError`` if fieldID is not an integer and
    ``IndexError`` if a data line lacks the ``1:N:0:`` marker or the
    requested ``+``-separated element.
    """
    # All three positional arguments are required: sys.argv[3] is read
    # below, so the guard has to demand at least four argv entries.
    if len(sys.argv) < 4:
        # print(<single string>) produces the same output under the
        # Python 2 print statement and the Python 3 print function.
        print('usage: python %s datafilename 1|2 outfilename ' % sys.argv[0])
        print('\tuse - for input if you want to read from standard input')
        print('\tthe input is supposed to be the output of PEFastqToTabDelimited.py')
        sys.exit(1)

    datafilename = sys.argv[1]
    fieldID = int(sys.argv[2])
    outfilename = sys.argv[3]

    if datafilename == '-':
        lineslist = sys.stdin
    elif datafilename.endswith('.gz'):
        lineslist = gzip.open(datafilename)
    else:
        lineslist = open(datafilename)

    HistDict = {}
    try:
        for t, line in enumerate(lineslist, 1):
            if t % 1000000 == 0:
                print('%d lines processed' % t)
            if line.startswith('#'):
                continue
            fields = line.strip().split('\t')
            # Text after '1:N:0:' holds the '+'-joined strings; pick the
            # requested one (fieldID is 1-based).
            BC = fields[0].split('1:N:0:')[1].split('+')[fieldID - 1]
            HistDict[BC] = HistDict.get(BC, 0) + 1
    finally:
        # Release the input handle, but never close the process's stdin.
        if lineslist is not sys.stdin:
            lineslist.close()

    # (count, string) tuples sorted in reverse give most frequent first.
    NewHistList = sorted(
        ((counts, BC) for BC, counts in HistDict.items()),
        reverse=True,
    )

    with open(outfilename, 'w') as outfile:
        for (counts, BC) in NewHistList:
            outfile.write(BC + '\t' + str(counts) + '\n')
        
# Script entry point: run() is invoked unconditionally when this module is
# loaded (there is no __main__ guard, so importing the file also runs it).
run()

