##################################
#                                #
# Last modified 08/28/2010       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys

def run():

    if len(sys.argv) < 4:
        print 'usage: python %s tracking tmap min_low_confidence outfilename [-datasetsInCellLine file]' % sys.argv[0]
        print '	-datasetsInCellLine file format: <name> <tab> <number of datasets>'
        print '	Note: the name has to be unique and it has to be present in the tracking file annotation>'
        sys.exit(1)

    tracking = sys.argv[1]
    tmap = sys.argv[2]
    conf=float(sys.argv[3])
    outputfilename = sys.argv[4]

    doMinDatasets=False
    if '-datasetsInCellLine' in sys.argv:
        doMinDatasets=True
        DatasetsCellDict={}
        listoflines=open(sys.argv[sys.argv.index('-datasetsInCellLine')+1])
        for line in listoflines:
            fields=line.strip().split('\t')
            DatasetsCellDict[fields[0].strip()]=int(fields[1])
            print fields[0], fields[1]

    outfile=open(outputfilename,'w')
    TranscriptToTypeDict={}
    GeneToTypeDict={}
    listoflines = open(tmap)
    for line in listoflines:
        fields=line.strip().split('\t')
        type=fields[2] 
        transcript=fields[4] 
        gene=fields[3] 
        TranscriptToTypeDict[transcript]=type
        GeneToTypeDict[gene]=type
    
    listoflines = open(tracking)
    presenceDict={}
    problematic=0
    o=1
    for line in listoflines:
        o+=1
        if o % 100000 == 0:
            print o
        fields=line.strip().split('\t')
        transcript=fields[0]
        if TranscriptToTypeDict.has_key(transcript):
            type=TranscriptToTypeDict[transcript]
        else:
            gene=fields[1]
            if GeneToTypeDict.has_key(gene):
                type=GeneToTypeDict[gene]
            else:
                type=fields[3]
                if type != 'u' and type != 'j':
                    problematic+=1
        if presenceDict.has_key(type):
            pass
        else:
            presenceDict[type]={}
            number=len(fields)-3
            if doMinDatasets:
                for i in range(len(DatasetsCellDict.keys())+1):
                    presenceDict[type][i]=0
            else:
                for i in range(number):
                    presenceDict[type][i]=0
        present=0
        if doMinDatasets:
            CellTypePresenceDict={}
            for field in fields[4:len(fields)]:
                if field=='-':
                    pass
                if field.startswith('q'):
                    for CellType in DatasetsCellDict.keys():
                        if CellType in field:
                            if CellTypePresenceDict.has_key(CellType):
                                pass
                            else:
                                CellTypePresenceDict[CellType]=0
                            break
                    LoConfFPKM=float(field.split('|')[4])
                    if LoConfFPKM >= conf:
                        try:
                            CellTypePresenceDict[CellType]+=1
                        except:
#                             print 'problem', field
                            pass
            for CellType in CellTypePresenceDict.keys():
                if CellTypePresenceDict[CellType]==DatasetsCellDict[CellType]:
                    present+=1
        else:
            for field in fields[4:len(fields)]:
                if field=='-':
                    pass
                if field.startswith('q'):
                    LoConfFPKM=float(field.split('|')[4])
                    if LoConfFPKM >= conf:
                        present+=1
        presenceDict[type][present]+=1

    print 'problematic (unknown type and no match in tmap)', problematic
    print presenceDict
 
    print presenceDict

    types=presenceDict.keys()
    types.sort()
    outline='#'
    if doMinDatasets:
        for i in range(len(DatasetsCellDict.keys())):
            outline=outline+'\t'+str(i)
    else:
        for i in range(number):
            outline=outline+'\t'+str(i)
    outfile.write(outline+'\n')
    for type in types:
        outline=type
        keys=presenceDict[type].keys()
        keys.sort()
        for presence in keys:
            outline=outline+'\t'+str(presenceDict[type][presence])
        outfile.write(outline + '\n')

    outfile.close()

run()

