##################################
#                                #
# Last modified 07/02/2012       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
from sets import Set
import scipy.stats

def run():

    if len(sys.argv) < 4:
        print 'usage: python %s ASE_file columnID p-value outfile' % sys.argv[0]
        print '\tAssumed format: #geneName	geneID	chr	C57BL_collapsed_reads	CAST_collapsed_reads	C57BL_fraction	CAST_fraction	_pvalue'
        print '\t columnID referse to which fraction to be outputted'
        sys.exit(1)

    ASE=sys.argv[1]
    fieldID=int(sys.argv[2])
    pvalue=float(sys.argv[3])
    outfilename = sys.argv[4]

    minReadNumber=0
    p = 1
    while p > pvalue:
        minReadNumber+=1
        p = scipy.stats.binom_test(0, minReadNumber, 0.5)
    print 'will only consider genes with more than', minReadNumber, 'allele-specific reads'

    chrDict={}

    outfile = open(outfilename, 'w')

    linelist=open(ASE)
    for line in linelist:
        fields=line.strip().split('\t')
        if line.startswith('#gene'):
            continue
        chr=fields[2]
        if chrDict.has_key(chr):
            pass
        else:
            chrDict[chr]=[]
        reads = int(fields[3]) + int(fields[4])
        if reads >= minReadNumber:
            chrDict[chr].append(fields[fieldID])

    keys=chrDict.keys()
    keys.sort()

    maxEntries=0
    outline = ''
    for chr in keys:
        outline=outline+chr+'\t'
        if len(chrDict[chr]) > maxEntries:
            maxEntries=len(chrDict[chr])
    outfile.write(outline.strip() +'\n')

    for i in range(maxEntries):
        outline=''
        for chr in keys:
            if len(chrDict[chr]) < maxEntries and i >= len(chrDict[chr]):
                outline=outline+'\t'
            else:
                outline=outline+chrDict[chr][i]+'\t'
        outfile.write(outline.strip() +'\n')

    outfile.close()

run()