##################################
#                                #
# Last modified 2017/10/19       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
from sets import Set

def run():

    if len(sys.argv) < 3:
        print 'usage: python %s contig_fasta length_bins output' % sys.argv[0]
        print '     lenght bins comma spearated'
        sys.exit(1)

    fasta = sys.argv[1]

    bins=sys.argv[2]
    bins=bins.split(',')
    binList=[]
    print bins
    for bin in bins:
        binList.append(int(bin))
    binList.append(0)
    binList = list(Set(binList))
    binList.sort()
    HistDict={}
    for bin in binList:
        HistDict[bin]={}
        HistDict[bin]['NC']=0
        HistDict[bin]['NBP']=0

    input_stream = open(fasta)
    sequenceList = []
    for line in input_stream:
        if line.startswith('>'):
            if len(sequenceList) > 0:
                sequence = ''.join(sequenceList)
                sequenceList=[]
                length = len(sequence)
                if length > max(binList):
                    HistDict[bin]['NC'] += 1
                    HistDict[bin]['NBP'] += length
                    continue
                for bin in binList:
                    if length >= bin and length < binList[binList.index(bin)+1]:
                        HistDict[bin]['NC'] += 1
                        HistDict[bin]['NBP'] += length
                        continue
            else:
                continue
        else:
            sequenceList.append(line.strip())

    TotalBP = 0.0

    for bin in HistDict.keys():
        TotalBP += HistDict[bin]['NBP']

    outfile = open(sys.argv[3],'w')
    outfile.write('bin\tNumberContigs\tTotalNumberBasePairs\tFractionOfTotalSequence\n')
    for bin in binList:
        if bin == 0:
            outline = '<' + str(binList[binList.index(bin)+1]) + '\t' + str(HistDict[bin]['NC'])  + '\t' + str(HistDict[bin]['NBP'])  + '\t' + str(HistDict[bin]['NBP']/TotalBP)
        else:
            outline = str(bin) + '\t' + str(HistDict[bin]['NC'])  + '\t' + str(HistDict[bin]['NBP'])  + '\t' + str(HistDict[bin]['NBP']/TotalBP)
        outfile.write(outline + '\n')
    outfile.close()

run()

