##################################
#                                #
# Last modified 11/02/2011       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
from sets import Set

def run():

    if len(sys.argv) < 6:
        print 'usage: python %s <list-of-geneIDs> <gtf> <FPKM table> <transcriptID field ID> <FPKM threshold> <FPKM fields> <outfilename>' % sys.argv[0]
        print '       fields and thresholds comma-separated'
        sys.exit(1)

    genes = sys.argv[1]
    gtf = sys.argv[2]
    expr = sys.argv[3]
    IDfield = int(sys.argv[4])
    minFPKM = float(sys.argv[5])
    FPKMfields =[]
    fields = sys.argv[6].split(',')
    for v in fields:
        FPKMfields.append(int(v))
    outfilename = sys.argv[7]

    linelist = open(genes)
    GeneDict={}  
    TranscriptToGeneDict={}
    for line in linelist:
        if line.startswith('#'):
            continue
        fields=line.strip().split('\t')
        geneID=fields[0]
        GeneDict[geneID]={}

    linelist = open(gtf)
    for line in linelist:
        if line.startswith('#'):
            continue
        fields=line.strip().split('\t')
        transcriptID=fields[8].split('transcript_id "')[1].split('"')[0]
        geneID=fields[8].split('gene_id "')[1].split('"')[0]
        if GeneDict.has_key(geneID):
            pass
        else:
            continue
        GeneDict[geneID][transcriptID]={}
        GeneDict[geneID][transcriptID]
        GeneDict[geneID][transcriptID]['expression']={}
        GeneDict[geneID][transcriptID]['major']={}
        TranscriptToGeneDict[transcriptID]=geneID

    print len(GeneDict.keys())

    IDtoLableDict={}
    linelist = open(expr)
    for line in linelist:
        fields=line.strip().split('\t')
        if line.startswith('#'):
            for ID in FPKMfields:
                IDtoLableDict[ID]=fields[ID]
            continue
        if line.startswith('tracking_id'):
            continue
        transcriptID=fields[IDfield]
        if TranscriptToGeneDict.has_key(transcriptID):
            geneID=TranscriptToGeneDict[transcriptID]
        else:
            continue
        for ID in FPKMfields:
            if 'FAIL' in fields[ID]:
                GeneDict[geneID][transcriptID]['expression'][ID]='FAIL'
            else:
                GeneDict[geneID][transcriptID]['expression'][ID] = float(fields[ID].split(',')[0])

    MajorIsoformsDict={}

    outfile=open(outfilename,'w')

    outline = '#Label\t'
    for ID in FPKMfields:
        label = IDtoLableDict[ID]
        outline = outline + label +'\t'
        MajorIsoformsDict[IDtoLableDict[ID]]={}
        for ID2 in FPKMfields:
            MajorIsoformsDict[IDtoLableDict[ID]][IDtoLableDict[ID2]]=0
    outfile.write(outline.strip() + '\n')

    i=0
    for geneID in GeneDict.keys():
        i+=1
        if i % 1000 == 0:
            print str(i)
        outline = geneID
        MajIsoDict={}
        for ID in FPKMfields:
            FPKM=0
            MIFPKM=0
            MajIsoDict[ID]=''
            for transcriptID in GeneDict[geneID].keys():
                if len(GeneDict[geneID][transcriptID]['expression']) == 0:
                    continue
                if GeneDict[geneID][transcriptID]['expression'][ID]=='FAIL':
                    FPKM=0
                    break
                else:
                    FPKM+=GeneDict[geneID][transcriptID]['expression'][ID]
                    if GeneDict[geneID][transcriptID]['expression'][ID] > MIFPKM:
                        MIFPKM=GeneDict[geneID][transcriptID]['expression'][ID]
                        MajIsoDict[ID] = transcriptID
            if FPKM < minFPKM:
                MajIsoDict[ID] = ''
        for ID1 in FPKMfields:
#            print ID1,
            if len(MajIsoDict[ID1]) > 0:
                for ID2 in FPKMfields:
#                    print ID2
                    if len(MajIsoDict[ID2]) > 0:
                        if MajIsoDict[ID1] != MajIsoDict[ID2]:
#                            print MajIsoDict[ID1],len(MajIsoDict[ID1]),MajIsoDict[ID2],len(MajIsoDict[ID2]),TranscriptToGeneDict[MajIsoDict[ID1]],TranscriptToGeneDict[MajIsoDict[ID2]]
                            MajorIsoformsDict[IDtoLableDict[ID1]][IDtoLableDict[ID2]]+=1
                            MajorIsoformsDict[IDtoLableDict[ID2]][IDtoLableDict[ID1]]+=1

    for ID1 in FPKMfields:
        label = IDtoLableDict[ID1]
        outline = label +'\t'
        for ID2 in FPKMfields:
            outline = outline + str(MajorIsoformsDict[IDtoLableDict[ID1]][IDtoLableDict[ID2]]) + '\t'
        outfile.write(outline.strip() + '\n')
    
    outfile.close()
	
run()
