##################################
#                                #
# Last modified 2017/06/01       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import gc
import math
import string
from sets import Set
import numpy as np
from scipy.stats import hypergeom

def run():

    if len(sys.argv) < 6:
        print 'usage: python %s table scoreFieldID minScore maxScore binaryFields outfilename [-log10]' % sys.argv[0]
        print '\tbinaryFields format: some combination of comma-separated and start:end'
        sys.exit(1)

    table = sys.argv[1]
    scoreID = int(sys.argv[2])
    minScore = float(sys.argv[3])
    maxScore = float(sys.argv[4])
    outfilename = sys.argv[6]

    doLog10 = False
    if '-log10' in sys.argv:
        doLog10 = True

    BinaryFields = []
    fields = sys.argv[5].split(',')
    for ID in fields:
        if ':' in ID:
            start = int(ID.split(':')[0])
            end = int(ID.split(':')[1])
            for i in range(start,end+1):
                BinaryFields.append(i)
        else:
            BinaryFields.append(int(ID))
    BinaryFields.sort()

    DataDict = {}
    BinaryDict = {}
    k=0
    SubSet = 0
    linelist = open(table)
    ii = 0
    LabelDict = {}
    for line in linelist:
        ii += 1
        if ii % 10000 == 0:
            print ii
        fields = line.strip().split('\t')
        if line.startswith('#'):
            for ID in BinaryFields:
                LabelDict[ID] = fields[ID]
                BinaryDict[ID] = {}
                BinaryDict[ID]['total'] = 0
                BinaryDict[ID]['common'] = 0
            continue
        score = float(fields[scoreID])
        InSubSet = False
        if score >= minScore and score <= maxScore:
            SubSet += 1
            InSubSet = True
        DataDict[k] = {}
        DataDict[k]['score'] = score
        DataDict[k]['B'] = {}
        for ID in BinaryFields:
            BS = int(fields[ID])
            DataDict[k]['B'][ID] = BS
            if BS == 1:
                BinaryDict[ID]['total'] += 1
            if InSubSet and BS == 1:
                BinaryDict[ID]['common'] += 1
        k+=1

    print 'finished parsing data'

    outfile = open(outfilename,'w')

    outline = '#label\tTotalElements\tInSubSet\tInLabel\tCommon\tp-val'
    outfile.write(outline + '\n')

    for ID in BinaryFields:
        BS = BinaryDict[ID]['total']
        Common = BinaryDict[ID]['common']
        rv = hypergeom(k, BS, SubSet)
        p = 1 - rv.cdf(Common)
        if Common == 0:
            p = 1
        p = max(p,1e-300)
        if doLog10:
            p = -math.log10(p)
        outline = LabelDict[ID] + '\t' + str(k)  + '\t' + str(SubSet) + '\t' + str(BS) + '\t' + str(Common) + '\t' + str(p)
        outfile.write(outline + '\n')

    outfile.close()

run()

