##################################
#                                #
# Last modified 12/14/2010       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string

def run():

    if len(sys.argv) < 7:
        print 'usage: python %s expression1 expression2 Name_fieldID FPKM_fieldID FPKM_lo_fieldID thresholds outfilename ' % sys.argv[0]
        print 'expression file should have isoform names in the GENCODE naming scheme' 
        sys.exit(1)

    FPKM1 = sys.argv[1]
    FPKM2 = sys.argv[2]
    NameID = int(sys.argv[3])
    FPKMID = int(sys.argv[4])
    FPKM_lo_ID = int(sys.argv[5])
    thresholds = sys.argv[6]
    ThresholdList1=thresholds.split(',')
    ThresholdList=[]
    for threshold in ThresholdList1:
        ThresholdList.append(float(threshold))
    ThresholdList.sort()
    outputfilename = sys.argv[7]

    outfile = open(outputfilename, 'w')

    ExpressionDict={}
    lineslist = open(FPKM1)
    for line in lineslist:
        if line.startswith('#'):
            continue
        fields = line.strip().split('\t')
        ID=fields[NameID]     
        geneName_fields=ID.split('-')
        if len(geneName_fields)==2:
            geneName=geneName_fields[0]
        else:
            geneName=''
            for p in range(len(geneName_fields)-1):
                geneName=geneName+'-'+geneName_fields[p]
            geneName=geneName[1:len(geneName)]
        FPKM=float(fields[FPKMID])
        FPKM_lo=float(fields[FPKM_lo_ID])
        if ExpressionDict.has_key(geneName):
            pass
        else:
            ExpressionDict[geneName]={}
        if ExpressionDict[geneName].has_key(ID):
            pass
        else:
            ExpressionDict[geneName][ID]={}
        ExpressionDict[geneName][ID]['rep1']=(FPKM,FPKM_lo)

    lineslist = open(FPKM2)
    for line in lineslist:
        if line.startswith('#'):
            continue
        fields = line.strip().split('\t')
        ID=fields[NameID]     
        geneName_fields=ID.split('-')
        if len(geneName_fields)==2:
            geneName=geneName_fields[0]
        else:
            geneName=''
            for p in range(len(geneName_fields)-1):
                geneName=geneName+'-'+geneName_fields[p]
            geneName=geneName[1:len(geneName)]
        FPKM=float(fields[FPKMID])
        FPKM_lo=float(fields[FPKM_lo_ID])
        ExpressionDict[geneName][ID]['rep2']=(FPKM,FPKM_lo)

    OutputDict={}
    OutputGeneDict={}
    for threshold in ThresholdList:
        OutputDict[threshold]={}
        OutputDict[threshold]['FPKM']={}
        OutputDict[threshold]['FPKM_lo']={}
        for geneName in ExpressionDict.keys():
            if OutputGeneDict.has_key(geneName):
                pass
            else:
                OutputGeneDict[geneName]={}
            FPKM_passing=0
            FPKM_lo_passing=0
            for ID in ExpressionDict[geneName].keys():
                if threshold==0:
                    if ExpressionDict[geneName][ID]['rep2'][0] > 0 or ExpressionDict[geneName][ID]['rep1'][0] > 0:
                        FPKM_passing+=1
                    if ExpressionDict[geneName][ID]['rep2'][1] > 0 or ExpressionDict[geneName][ID]['rep1'][1] > 0:
                        FPKM_lo_passing+=1
                else:
                    if ExpressionDict[geneName][ID]['rep2'][0] >=threshold and ExpressionDict[geneName][ID]['rep1'][0] >= threshold: 
                        FPKM_passing+=1
                    if ExpressionDict[geneName][ID]['rep2'][1] >=threshold and ExpressionDict[geneName][ID]['rep1'][1] >= threshold: 
                        FPKM_lo_passing+=1
            if OutputDict[threshold]['FPKM'].has_key(FPKM_passing):
                OutputDict[threshold]['FPKM'][FPKM_passing]+=1
            else:
                OutputDict[threshold]['FPKM'][FPKM_passing]=1
            if OutputDict[threshold]['FPKM_lo'].has_key(FPKM_lo_passing):
                OutputDict[threshold]['FPKM_lo'][FPKM_lo_passing]+=1
            else:
                OutputDict[threshold]['FPKM_lo'][FPKM_lo_passing]=1
            OutputGeneDict[geneName][threshold]=(FPKM_passing,FPKM_lo_passing)

    outline='threshold'
    for i in range(0,15):
        outline=outline+'\t'+str(i)
    for i in range(0,15):
        outline=outline+'\t'+str(i)
    outfile.write(outline+'\n')
    for i in ThresholdList:
        outline=str(i)
        for FPKM_passing in range(0,15):
            if OutputDict[i]['FPKM'].has_key(FPKM_passing):
                outline=outline+'\t'+str(OutputDict[i]['FPKM'][FPKM_passing])
            else:
                outline=outline+'\t0'
        for FPKM_lo_passing in range(0,15):
            if OutputDict[i]['FPKM_lo'].has_key(FPKM_lo_passing):
                outline=outline+'\t'+str(OutputDict[i]['FPKM_lo'][FPKM_lo_passing])
            else:
                outline=outline+'\t0'
        outfile.write(outline+'\n')

    outfile.write('\n')
    outfile.write('\n')
    outfile.write('\n')
    outline='#threshold:'
    for threshold in ThresholdList:
        outline=outline+'\t'+str(threshold)
    outfile.write(outline+'\n')
    keys=OutputGeneDict.keys()
    keys.sort()
    for geneName in keys:
        outline=geneName
        for threshold in ThresholdList:
            outline=outline+'\t'+str(OutputGeneDict[geneName][threshold][0])+','+str(OutputGeneDict[geneName][threshold][1])
        outfile.write(outline+'\n')
    outfile.close()

run()

