##################################
#                                #
# Last modified 11/22/2011       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
from sets import Set

def run():

    if len(sys.argv) < 6:
        print 'usage: python %s <gtf (with class codes)> <FPKM table> <transcriptID field ID> <FPKM thresholds> <FPKM fields> <outfilename> [-status]' % sys.argv[0]
        print '       fields and thresholds comma-separated'
        print '       use the -status option if the Cufflinks status is added to FPKM values'
        sys.exit(1)

    gtf = sys.argv[1]
    expr = sys.argv[2]
    IDfield = int(sys.argv[3])
    FPKMthresholds =[]
    fields = sys.argv[4].split(',')
    for v in fields:
        FPKMthresholds.append(float(v))
    FPKMfields =[]
    fields = sys.argv[5].split(',')
    for v in fields:
        FPKMfields.append(int(v))
    outfilename = sys.argv[6]

    print FPKMthresholds
    print FPKMfields

    doStatus=False
    if '-status' in sys.argv:
        doStatus=True

    linelist = open(gtf)
    GeneDict={}  
    TranscriptToGeneDict={}
    for line in linelist:
        if line.startswith('#'):
            continue
        fields=line.strip().split('\t')
        transcriptID=fields[8].split('transcript_id "')[1].split('"')[0]
        geneID=fields[8].split('gene_id "')[1].split('"')[0]
        if 'class_code' in fields[8]:
            pass
        else:
            if geneID.startswith('ENS'):
                pass
            else:
                continue
        if 'class_code' in fields[8]:
            class_code = fields[8].split('class_code "')[1].split('"')[0]
        else:
            class_code = '='
        if GeneDict.has_key(geneID):
            pass
        else:
            GeneDict[geneID]={}
        GeneDict[geneID][transcriptID]={}
        GeneDict[geneID][transcriptID]['class_code']=class_code
        GeneDict[geneID][transcriptID]['expression']={}
        TranscriptToGeneDict[transcriptID]=geneID

    linelist = open(expr)
    for line in linelist:
        fields=line.strip().split('\t')
        if line.startswith('#'):
            continue
        if line.startswith('tracking_id'):
            continue
        transcriptID=fields[IDfield]
        geneID=TranscriptToGeneDict[transcriptID]
        for ID in FPKMfields:
            if doStatus:
                (value,status) = tuple(fields[ID].split(','))
                if status == 'FAIL':
                    GeneDict[geneID][transcriptID]['expression'][ID]=status
                else:
                    GeneDict[geneID][transcriptID]['expression'][ID]=float(value)
            else:
                value = fields[ID]
                status='OK'
                GeneDict[geneID][transcriptID]['expression'][ID]=float(value)

    outfile=open(outfilename,'w')

    outline = '#GeneID\t'
    for FPKM in FPKMthresholds:
        outline = outline + 'known > ' + str(FPKM) + '\t'
    for FPKM in FPKMthresholds:
        outline = outline + 'novel > ' + str(FPKM) + '\t'
    for FPKM in FPKMthresholds:
        outline = outline + 'number_major_isoforms > ' + str(FPKM) + '\t'
    outfile.write(outline+'\n')

    i=0
    p=0
    for geneID in GeneDict.keys():
        i+=1
        if i % 1000 == 0:
            print str(i)
        outline = geneID
        CountsDictNovel={}
        CountsDictKnown={}
        Skip=False
        for FPKM in FPKMthresholds:
            CountsDictNovel[FPKM]=0
            CountsDictKnown[FPKM]=0
            for transcriptID in GeneDict[geneID].keys():
                try:
                    for ID in FPKMfields:
                        if GeneDict[geneID][transcriptID]['expression'][ID] != 'FAIL' and GeneDict[geneID][transcriptID]['expression'][ID] > FPKM:
                            if GeneDict[geneID][transcriptID]['class_code'] == 'j':
                                CountsDictNovel[FPKM]+=1
                            if GeneDict[geneID][transcriptID]['class_code'] == '=':
                                CountsDictKnown[FPKM]+=1
                            break
                except:
                    p+=1
                    print p, 'problem with', transcriptID, 'skipping'
                    del GeneDict[geneID][transcriptID]
                    Skip=True
        if Skip:
            continue
        for FPKM in FPKMthresholds:
            outline = outline +'\t' + str(CountsDictKnown[FPKM])
        for FPKM in FPKMthresholds:
            outline = outline +'\t' + str(CountsDictNovel[FPKM])
        MajorIsoforms=[]
        for ID in FPKMfields:
            MIFPKM=0
            MajorIsoform=''
            for transcriptID in GeneDict[geneID].keys():
                if GeneDict[geneID][transcriptID]['expression'][ID] != 'FAIL' and GeneDict[geneID][transcriptID]['expression'][ID] > MIFPKM:
                    MIFPKM=GeneDict[geneID][transcriptID]['expression'][ID]
                    MajorIsoform = transcriptID
            if MajorIsoform != '':
                MajorIsoforms.append((MIFPKM,MajorIsoform))
        for FPKM in FPKMthresholds:
            MajorIsoformsFPKM=[]
            for (MIFPKM,MajorIsoform) in MajorIsoforms:
                if MIFPKM >= FPKM:
                    MajorIsoformsFPKM.append(MajorIsoform)
            MajorIsoformsFPKM = list(Set(MajorIsoformsFPKM))
            outline = outline +'\t' + str(len(MajorIsoformsFPKM))
        outfile.write(outline + '\n')

    outfile.close()
	
run()
