##################################
#                                #
# Last modified 2019/01/05       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
import math
import gzip
from sets import Set

def _parse_id_fields(spec):
    """Return the column indices described by *spec*.

    *spec* is either ``start-end`` (both ends included) or a comma-separated
    list of integers.  The indices are used directly to index the
    tab-split fields of each line, i.e. they are 0-based.
    """
    if '-' in spec:
        parts = spec.split('-')
        return list(range(int(parts[0]), int(parts[1]) + 1))
    return [int(ID) for ID in spec.split(',')]


def _parse_numeric_bins(spec):
    """Return the sorted, de-duplicated list of bin lower edges for *spec*.

    *spec* is either ``start:end:step`` or a comma-separated list of values.
    For ``start:end:step`` the edges run from ``start`` in increments of
    ``step`` until the first edge greater than ``end`` (that edge is kept),
    and ``start`` and ``end`` themselves are always included.  For a
    comma-separated list an extra edge at 0 is always added.

    Raises ValueError if ``step`` is not positive (the edge generation
    would otherwise never terminate).
    """
    edges = []
    if ':' in spec:
        parts = spec.split(':')
        start = float(parts[0])
        end = float(parts[1])
        step = float(parts[2])
        print('%s %s %s' % (start, end, step))
        if not step > 0:
            raise ValueError('bin step must be positive, got %s' % step)
        edge = start
        edges.append(edge)
        while edge <= end:
            edge += step
            edges.append(edge)
        edges.append(start)
        edges.append(end)
    else:
        for item in spec.split(','):
            edges.append(float(item))
        edges.append(0)
    return sorted(set(edges))


def _open_input(datafilename):
    """Open the input: '-' is standard input, '*.gz' is read through gzip."""
    if datafilename == '-':
        return sys.stdin
    if datafilename.endswith('.gz'):
        return gzip.open(datafilename)
    return open(datafilename)


def _read_columns(lineslist, IDfields, doStringBins, doCS):
    """Collect the raw values of the requested columns, keyed by column label.

    Lines starting with '#' are header lines: they define the label of each
    requested column and reset that label's value list.  A header line must
    precede the data lines, because data values are stored under the label
    taken from the most recent header.

    Returns ``(DataDict, seen)`` where ``DataDict`` maps label -> list of
    values and ``seen`` is the set of distinct strings encountered (only
    filled when *doStringBins* is true).
    """
    LabelDict = {}
    DataDict = {}
    seen = set()
    i = 0
    for line in lineslist:
        i += 1
        if i % 100000 == 0:
            print(i)
        if line[0] == '#':
            fields = line.split('\n')[0].split('\t')
            for ID in IDfields:
                LabelDict[ID] = fields[ID]
                print('%s %s' % (ID, fields[ID]))
                DataDict[fields[ID]] = []
            continue
        fields = line.strip().split('\t')
        for ID in IDfields:
            # Short lines and empty cells contribute nothing.
            if len(fields) <= ID or fields[ID] == '':
                continue
            column = DataDict[LabelDict[ID]]
            if doStringBins:
                seen.add(fields[ID])
                column.append(fields[ID])
            elif fields[ID] == 'NaN':
                column.append(0)
            elif doCS:
                # Cells look like "value,status"; FAIL entries count as 0.
                (value, status) = fields[ID].split(',')
                if status != 'FAIL':
                    column.append(value)
                else:
                    column.append(0)
            else:
                column.append(fields[ID].split(',')[0])
    return DataDict, seen


def _count_numeric(ID, values, binList, counts):
    """Add each value in *values* to the bin of *counts* it falls into.

    A value belongs to the bin whose edge is the largest one not exceeding
    it; values at or above the last edge go into the last bin.  Values below
    the first edge, and values that parse to NaN, are not counted.  Values
    that cannot be parsed as a float are reported and skipped.
    """
    import bisect

    for value in values:
        try:
            num = float(value)
        except ValueError:
            print('problem with %s %s' % (ID, value))
            continue
        if math.isnan(num):
            continue
        idx = bisect.bisect_right(binList, num) - 1
        if idx < 0:
            continue
        counts[binList[idx]] += 1


def run():
    """Build per-column histograms of a tab-delimited file and write them out.

    Command line (read from sys.argv):
        datafilename IDs bins|"string" outfilename [-cufflinksStatus]

    * datafilename: input path, '-' for standard input, '*.gz' for gzip.
    * IDs: columns to histogram, comma separated or start-end (inclusive).
    * bins: comma-separated edges, start:end:binsize, or the literal
      'string' to count occurrences of each distinct string instead.
    * outfilename: tab-delimited output; the first line lists the bins and
      each following line holds one column label and its counts.
    * -cufflinksStatus: treat cells as "value,status" and count entries
      whose status is FAIL as 0.

    Exits with status 1 after printing usage when too few arguments are
    given.
    """
    # Four positional arguments are required (outfilename is sys.argv[4]).
    if len(sys.argv) < 5:
        print('usage: python %s  datafilename IDs [bins | "string"] outfilename [-cufflinksStatus]' % sys.argv[0])
        print('format of IDs: either comma separated or start-end (including end)')
        print('format of bins: either comma-separated values or start:end:binsize')
        print('\tuse - for input if you want to read from standard input')
        sys.exit(1)

    doCS = False
    if '-cufflinksStatus' in sys.argv:
        doCS = True
        print('will discard FAIL status entries')

    datafilename = sys.argv[1]
    IDfields = _parse_id_fields(sys.argv[2])
    outfilename = sys.argv[4]

    doStringBins = False
    binList = []
    if sys.argv[3] == 'string':
        doStringBins = True
        print('will treat data as strings')
    else:
        binList = _parse_numeric_bins(sys.argv[3])

    print(binList)

    # The output is opened before the input is read so that an unwritable
    # output path is reported before any processing starts.
    outfile = open(outfilename, 'w')
    try:
        lineslist = _open_input(datafilename)
        try:
            DataDict, seen = _read_columns(lineslist, IDfields,
                                           doStringBins, doCS)
        finally:
            if lineslist is not sys.stdin:
                lineslist.close()

        if doStringBins:
            # In string mode the bins are the distinct strings observed.
            binList = sorted(seen)

        IDkeys = sorted(DataDict)
        HistDict = {}
        for ID in IDkeys:
            HistDict[ID] = dict((edge, 0) for edge in binList)

        for ID in IDkeys:
            print('%s .... %s' % (ID, ID))
            if doStringBins:
                for value in DataDict[ID]:
                    HistDict[ID][value] += 1
            else:
                _count_numeric(ID, DataDict[ID], binList, HistDict[ID])

        print(IDfields)
        print(IDkeys)

        outfile.write('\t'.join(['#'] + [str(edge) for edge in binList]) + '\n')
        for ID in IDkeys:
            row = [ID] + [str(HistDict[ID][edge]) for edge in binList]
            outfile.write('\t'.join(row) + '\n')
    finally:
        outfile.close()
        
# Script entry point: run() is called unconditionally at module level, so it
# executes (reading sys.argv) whenever this file is run or imported.
run()

