##################################
#                                #
# Last modified 10/26/2011       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import bisect
import math
import string
import sys
from sets import Set

def _parse_bin_edges(tokens):
    """Return the sorted, de-duplicated list of bin edges as floats.

    tokens -- list of strings, each parseable by float().

    0.0 is always included as an edge, whether or not it is listed.
    """
    edges = set(float(token) for token in tokens)
    edges.add(0.0)
    return sorted(edges)


def _bin_index(value, edges):
    """Return the index of the edge whose bin ``value`` falls into.

    edges -- sorted list of bin edges (non-empty).

    Bin i covers (edges[i], edges[i+1]]: a value equal to an edge is counted
    in the bin below that edge.  Values above the last edge go to the last
    bin; values at or below the first edge go to the first bin, so every
    value is assigned to exactly one bin.
    """
    # bisect_left gives the first position whose edge is >= value; the bin
    # is the one just before it.  Clamp at 0 for values <= edges[0].
    return max(bisect.bisect_left(edges, value) - 1, 0)


def run():
    """Histogram a counts column per (novelty, connectivity) category.

    Command-line arguments (sys.argv):
      1  input file name (tab-separated; lines starting with '#' are skipped)
      2  0-based index of the counts field
      3  comma-separated bin edges (0 is always added)
      4  0-based index of the connectivity field
      5  0-based index of the novelty field
      6  output file name

    Writes one header line followed by one tab-separated line per
    (novelty, connectivity) pair, sorted, with the number of input rows
    falling in each bin.  Prints usage and exits with status 1 when fewer
    than six arguments are given.
    """
    # Six positional arguments are read below, so argv must hold 7 entries.
    if len(sys.argv) < 7:
        print('usage: python %s  input fieldID_counts [(0),number1,number2,number3...,numberN] fieldID_connectivity fieldID_novelty outfilename' % sys.argv[0])
        print('       input file format: field[7] and fields[8] - novel/known and connectivity, fields[11] - counts')
        sys.exit(1)

    datafilename = sys.argv[1]
    fieldIDCounts = int(sys.argv[2])
    bins = sys.argv[3].split(',')
    fieldIDConnectivity = int(sys.argv[4])
    fieldIDNovelty = int(sys.argv[5])
    outfilename = sys.argv[6]

    print(bins)
    edges = _parse_bin_edges(bins)

    # Maps (novelty, connectivity) -> list of per-bin counts, parallel to
    # ``edges``.  Rows are tallied while streaming so the whole input never
    # has to be held in memory.
    histograms = {}
    with open(datafilename) as infile:
        for lineno, line in enumerate(infile, 1):
            if lineno % 1000000 == 0:
                print('%d lines processed' % lineno)
            # Skip comment lines and blank lines (e.g. a trailing newline).
            if line.startswith('#') or not line.strip():
                continue
            # NOTE: strip() also removes leading/trailing tabs, so an empty
            # first field shifts the remaining field indices.
            fields = line.strip().split('\t')
            counts = float(fields[fieldIDCounts])
            category = (fields[fieldIDNovelty], fields[fieldIDConnectivity])
            tally = histograms.get(category)
            if tally is None:
                tally = histograms[category] = [0] * len(edges)
            tally[_bin_index(counts, edges)] += 1

    # Open the output only after the input was read successfully, so a
    # failed run does not leave a truncated result file behind.
    with open(outfilename, 'w') as outfile:
        header = ['#novelty', 'type'] + [str(edge) for edge in edges]
        outfile.write('\t'.join(header) + '\n')
        for category in sorted(histograms):
            cells = list(category) + [str(n) for n in histograms[category]]
            outfile.write('\t'.join(cells) + '\n')
        
# Run the command-line tool only when executed as a script, so importing
# this module has no side effects.
if __name__ == '__main__':
    run()

