##################################
#                                #
# Last modified 07/19/2011       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
import scipy.stats
from sets import Set

def run():

    if len(sys.argv) < 2:
        print 'usage: python %s bowtie_mapping outfile' % sys.argv[0]
        print '     The scrupt assumes that reads have been mapped with 0 mismatches and multiplicity of 1 against a bowtie index derived from the output of makeHetTranscriptome.py' 
        sys.exit(1)

    bowtie=sys.argv[1]
    outfilename = sys.argv[2]

    GeneDict={}

    parentsDict={}

    linelist=open(bowtie)
    i=0
    for line in linelist:
        i+=1
        if i % 1000000 == 0:
            print i, 'lines processed'
        fields=line.strip().split('\t')
        if fields[6]!='0':
            continue
        if len(fields[2].split('::')) <=2:
            continue
        gene=fields[2].split('::')[0]
        if GeneDict.has_key(gene):
            pass
        else:
            GeneDict[gene]={}
        parent=fields[2].split('::')[-1]
        if GeneDict[gene].has_key(parent):
            pass
        else:
            GeneDict[gene][parent]=[]
        GeneDict[gene][parent].append((fields[1],fields[3]))
        chr=fields[2].split('::')[1].split(':')[0]
        left=fields[2].split('::')[1].split(':')[1].split('-')[0]
        right=fields[2].split('::')[1].split(':')[1].split('-')[1][0:-1]
        strand=fields[2].split('::')[1][-1]
        GeneDict[gene]['coordinates']=(chr,left,right,strand)
        if parentsDict.has_key(parent):
            pass
        else:
            parentsDict[parent]=''
      
    outfile = open(outfilename, 'w')

    keys=GeneDict.keys()
    keys.sort()

    parents=parentsDict.keys()
    if len(parents) > 2:
        print 'more than 2 parents detected, exiting'
        print parents
        sys.exit(1)
    parents.sort()
    parent1 = parents[0]
    parent2 = parents[1]

    print parents

    outline = '#gene\tchr\tleft\tright\tstrand\t' + parent1 + '_collapsed_reads' + '\t' + parent2 + '_collapsed_reads' + '\t' + parent1 + '_fraction' + '\t' + parent2 + '_fraction' + '\t' + '_pvalue'
    outfile.write(outline +'\n')

    for gene in keys:
        (chr,left,right,strand)=GeneDict[gene]['coordinates']
        if GeneDict[gene].has_key(parent1):
            parent1Counts = len(list(Set(GeneDict[gene][parent1])))
        else:
            parent1Counts = 0
        if GeneDict[gene].has_key(parent2):
            parent2Counts = len(list(Set(GeneDict[gene][parent2])))
        else:
            parent2Counts = 0
        pvalue =  scipy.stats.binom_test(parent1Counts, parent2Counts + parent1Counts, 0.5)
        outline = gene + '\t' + chr + '\t' + left + '\t' + right + '\t' + strand + '\t' + str(parent1Counts) + '\t' + str(parent2Counts) + '\t' + str(parent1Counts/(parent1Counts + parent2Counts + 0.0)) + '\t' + str(parent2Counts/(parent1Counts + parent2Counts + 0.0)) + '\t' + str(pvalue)
        outfile.write(outline +'\n')

    outfile.close()

run()