##################################
#                                #
# Last modified 2018/10/20       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import gzip

def run():

    if len(sys.argv) < 1:
        print 'usage: python %s inputfilename [-sortByAbundance]' % sys.argv[0]
        print '\tuse - instead of an input filename to indicate standard input' 
        print '\tthe script will print to stodut by default' 
        sys.exit(1)

    inputfilename = sys.argv[1]

    doSBA = False
    if '-sortByAbundance' in sys.argv:
        doSBA = True

    doStdInput = False
    if inputfilename == '-':
        doStdInput = True

    SeqDict = {}

    i=0
    pos=1
    if doStdInput:
        input_stream = sys.stdin
    else:
        if inputfilename.endswith('.gz'):
            input_stream = gzip.open(inputfilename)
        else:
            input_stream = open(inputfilename)
    for line in input_stream:
        previous=line
        if pos==1:
            if line.startswith('@'):
                pos=2
                continue
            else:
                print 'invalid read', line
                break
        if pos==2:
            i=i+1
            if i % 10000000 == 0:
                print str(i/1000000) + 'M reads processed'
            seq = line.strip()
            if SeqDict.has_key(seq):
                SeqDict[seq] += 1
            else:
                SeqDict[seq] = 1
            pos=3
            continue
        if pos==3 and line.startswith('+'):
            pos=4
            continue
        if pos==4:
            pos=1
            continue

    seqs = SeqDict.keys()

    NormFactor = i/1000000.

    if doSBA:
        seqCountsList = []
        for seq in seqs:
            seqCountsList.append((SeqDict[seq],seq))
        seqCountsList.sort()
        seqCountsList.reverse()
        for (counts,seq) in seqCountsList:
            outline = seq + '\t' + str(counts) + '\t' + str(counts/NormFactor)
            print outline
    else:
        seqs.sort()
        for seq in seqs:
            outline = seq + '\t' + str(SeqDict[seq]) + '\t' + str(counts/NormFactor)
            print outline

run()

