##################################
#                                #
# Last modified 03/25/2011       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
import math
from sets import Set

def _iter_scored_lines(datafilename, fieldID, skipZero):
    """Yield ``(score, fields, line)`` for each data line of *datafilename*.

    ``line`` is the raw line as read from the file, ``fields`` is the
    stripped line split on tabs and ``score`` is ``float(fields[fieldID])``.

    Lines whose first character is '#' are skipped, as are blank lines
    (which would otherwise fail in the float conversion).  When *skipZero*
    is true, lines whose score equals 0 are skipped as well.  A progress
    message is printed every 100000 physical lines, skipped ones included.
    """
    with open(datafilename) as infile:
        for lineNumber, line in enumerate(infile, 1):
            if lineNumber % 100000 == 0:
                print('%d lines processed' % lineNumber)
            if line[0] == '#':
                continue
            stripped = line.strip()
            if not stripped:
                continue
            fields = stripped.split('\t')
            score = float(fields[fieldID])
            if skipZero and score == 0:
                continue
            yield score, fields, line


def _write_threshold_bins(datafilename, fieldID, skipZero, specificbins,
                          outfileprefix):
    """Split the input lines by explicit lower bounds.

    *specificbins* must be sorted in increasing order.  Each line goes to
    the bin with the largest lower bound that is <= its score; lines that
    score below the smallest bound are dropped.  Every bin is written to
    ``outfileprefix + '.' + str(bound)`` with its lines unchanged and in
    input order; a file is created even for a bin that received no lines.
    """
    DataDict = dict((bound, []) for bound in specificbins)
    # Scan from the largest bound down so the first match is the right bin.
    descending = specificbins[::-1]
    for score, fields, line in _iter_scored_lines(datafilename, fieldID,
                                                  skipZero):
        for bound in descending:
            if score >= bound:
                DataDict[bound].append(line)
                break
    for bound in specificbins:
        with open(outfileprefix + '.' + str(bound), 'w') as outfile:
            outfile.writelines(DataDict[bound])


def _write_equal_bins(datafilename, fieldID, skipZero, bins, outfileprefix):
    """Split the input lines into *bins* groups of equal size by rank.

    Records are ordered from the largest score to the smallest (ties are
    ordered by the remaining fields, also descending) and written to
    ``outfileprefix + '.quartile_<k>'`` for k = 1..bins.  Each bin holds
    ``len(records) // bins`` records (at least 1); whatever is left over
    when the count is not a multiple of *bins* goes into the last bin.
    All *bins* files are created, even if some end up empty.
    """
    DataList = []
    for score, fields, line in _iter_scored_lines(datafilename, fieldID,
                                                  skipZero):
        DataList.append((score,) + tuple(fields))
    DataList.sort(reverse=True)

    # max(1, ...) keeps the slicing below meaningful when there are fewer
    # records than bins.
    binSize = max(1, len(DataList) // bins)
    for binIndex in range(bins):
        start = binIndex * binSize
        if binIndex == bins - 1:
            # The last bin absorbs the remainder.
            end = len(DataList)
        else:
            end = start + binSize
        outfilename = outfileprefix + '.quartile_' + str(binIndex + 1)
        with open(outfilename, 'w') as outfile:
            for record in DataList[start:end]:
                # record[0] is the numeric sort key; the rest are the
                # original tab-separated fields.
                outfile.write('\t'.join(record[1:]).strip() + '\n')


def run():
    """Command-line entry point: split a tab-delimited file into bins.

    Reads from sys.argv:
        datafilename     tab-delimited input; lines starting with '#' are
                         ignored
        datafileFieldID  0-based index of the column holding the score
        bins             number of equal-size bins (parsed but not used
                         when -bins is given)
        outfileprefix    prefix of every output file name
        -skipzeros       optional; ignore lines whose score is 0
        -bins v1,v2,...  optional; use these values as lower bounds of the
                         bins instead of equal-size bins.  0 is added as a
                         bound unless -skipzeros is given.

    Prints the usage message and exits with status 1 when fewer than the
    four positional arguments are supplied.
    """
    # Four positional arguments plus the program name.
    if len(sys.argv) < 5:
        print('usage: python %s  datafilename datafileFieldID bins outfileprefix [-skipzeros] [-bins 0,value1,value2....valueN]' % sys.argv[0])
        print('       Note: values will be sorted from large to small')
        sys.exit(1)

    datafilename = sys.argv[1]
    fieldID = int(sys.argv[2])
    bins = int(sys.argv[3])
    outfileprefix = sys.argv[4]

    skipZero = '-skipzeros' in sys.argv

    if '-bins' in sys.argv:
        valueIndex = sys.argv.index('-bins') + 1
        if valueIndex >= len(sys.argv):
            print('-bins requires a comma-separated list of values')
            sys.exit(1)
        specificbins = [float(b) for b in sys.argv[valueIndex].split(',')]
        if not skipZero:
            specificbins.append(0)
        specificbins = sorted(set(specificbins))
        _write_threshold_bins(datafilename, fieldID, skipZero, specificbins,
                              outfileprefix)
    else:
        if bins < 1:
            print('bins must be a positive integer')
            sys.exit(1)
        _write_equal_bins(datafilename, fieldID, skipZero, bins,
                          outfileprefix)
        
# Script entry point. The call is unconditional, so importing this module
# also executes run() against the current sys.argv.
run()

