##################################
#                                #
# Last modified 04/04/2010       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
from sets import Set

try:
	import psyco
	psyco.full()
except:
	pass

def run():

    if len(sys.argv) < 4:
        print 'usage: python %s <list of files filename (name,path)> <threshold> <gtf> <outfilename> [-chr name] ' % sys.argv[0]
        sys.exit(1)

    inputfilename = sys.argv[1]
    threshold = float(sys.argv[2])
    gtf = sys.argv[3]
    outputfilename = sys.argv[4]
    doChr=False
    TheChr=''
    if '-chr' in sys.argv:
        doChr=True
        TheChr=sys.argv[sys.argv.index('-chr')+1]
        print 'Will only consider ', TheChr

    listoflines = open(inputfilename)
    datasetDict={}
    CumCoverageDict={}
    for line in listoflines:
        fields=line.strip().split(',')
        if len(fields)<2:
            continue
        datasetDict[fields[0]]=fields[1]
        CumCoverageDict[fields[0]]=0

    datasets=datasetDict.keys()
    datasets.sort()
    GenomeCoverageDict={}

    p=len(datasets)
    for dataset in datasets:
        p=p-1
        print p, dataset, datasetDict[dataset]
        i=0
        listoflines = open(datasetDict[dataset])
        for line in listoflines:
            if line[0:3]!='chr':
               continue
            fields=line.strip().split('\t')
            if len(fields)==1:
                fields=fields[0].split(' ')
            chr=fields[0]
            i+=1
            if doChr and chr!=TheChr:
                continue
            if GenomeCoverageDict.has_key(chr):
                pass
            else:
                GenomeCoverageDict[chr]={}
            if i % 100000 == 0:
                print p, dataset, i, 'lines processed'
            try:
                start=int(fields[1])
            except:
                continue
            stop=int(fields[2])
            score=int(fields[3].split('.')[0])
            if score < threshold or score==0:
                continue
            for j in range(start,stop):
                if GenomeCoverageDict[chr].has_key(j):
                    GenomeCoverageDict[chr][j]+=score
                else:
                    GenomeCoverageDict[chr][j]=score
        for chr in GenomeCoverageDict.keys():
            for j in GenomeCoverageDict[chr].keys():
                if GenomeCoverageDict[chr][j] >= threshold:
                    CumCoverageDict[dataset]+=1

    print GenomeCoverageDict.keys()

    BTPosDict={}
    BTDict={}
    BTRevDict={}
    BTCount=0
    BTLenDict={}
    DepthDict={}
    DepthDict['Genome']={}
    IntronicDict={}
    BTLenDict['Intronic']=0
    DepthDict['Unannotated']={}
    DepthDict['Intronic']={}

    print 'Inputing annotation'
    print 'parsing introns'
    print 'summarizing depth stats'
    c1=0
    c2=0
    chrkeys=GenomeCoverageDict.keys()
    chrkeys.sort()
    for chr in chrkeys:
        print chr
        listoflines = open(gtf)
        for line in listoflines:
            fields=line.strip().split('\t')
            if fields[0]!=chr:
                continue
            start=int(fields[3])
            stop=int(fields[4])
            if fields[2]=='transcript':
                for i in range(start,stop):
                    IntronicDict[chr][i]=1
                continue
            if IntronicDict.has_key(chr):
                pass
            else:
                IntronicDict[chr]={}
            if BTPosDict.has_key(chr):
                pass
            else:
                BTPosDict[chr]={}
            if fields[2]!='exon':
                continue
            BT=fields[8].split('transcript_type "')[1].split('"')[0]
            if BTDict.has_key(BT):
                pass
            else:
                BTDict[BT]=BTCount
                BTRevDict[BTCount]=BT
                DepthDict[BT]={}
                BTLenDict[BT]=0
                BTCount+=1
            for i in range(start,stop):
                IntronicDict[chr][i]=0
                if BTPosDict[chr].has_key(i):
                    if BTDict[BT] not in BTPosDict[chr][i]:
                        BTPosDict[chr][i].append(BTDict[BT])
                        BTLenDict[BT]+=1
                else:
                    BTPosDict[chr][i]=[]
                    BTPosDict[chr][i].append(BTDict[BT])
                    BTLenDict[BT]+=1
        if IntronicDict.has_key(chr):
            for i in IntronicDict[chr].keys():
                if IntronicDict[chr][i]==0:
                    del IntronicDict[chr][i]
                else:
                    BTLenDict['Intronic']+=1
        else:
            continue
        if GenomeCoverageDict.has_key(chr):
            for i in GenomeCoverageDict[chr].keys():
                c1+=1
                totalscore=int(GenomeCoverageDict[chr][i])
                if DepthDict['Genome'].has_key(totalscore):
                    DepthDict['Genome'][totalscore]+=1
                else:
                    DepthDict['Genome'][totalscore]=1
                if IntronicDict[chr].has_key(i):
                    if DepthDict['Intronic'].has_key(totalscore):
                        DepthDict['Intronic'][totalscore]+=1
                    else:
                        DepthDict['Intronic'][totalscore]=1
                elif BTPosDict[chr].has_key(i):
                    for BT in BTPosDict[chr][i]:
                        if DepthDict[BTRevDict[BT]].has_key(totalscore):
                            DepthDict[BTRevDict[BT]][totalscore]+=1
                        else:
                            DepthDict[BTRevDict[BT]][totalscore]=1
                else:
                    if DepthDict['Unannotated'].has_key(totalscore):
                        DepthDict['Unannotated'][totalscore]+=1
                    else:
                        DepthDict['Unannotated'][totalscore]=1
        else:
            pass
        del IntronicDict[chr]
        del BTPosDict[chr]
        del GenomeCoverageDict[chr]

    print c1, c2                
    outfile = open(outputfilename, 'w')

    outfile.write('#Cumulative coverage by dataset\n')
    outline = 'Genome\t-----'
    for dataset in datasets:
        outline=outline+'\t'+dataset
    outfile.write(outline +' \n')
    outline = 'Genome\t-----'
    for dataset in datasets:
        outline=outline+'\t'+str(CumCoverageDict[dataset])
    outfile.write(outline +' \n')
    outfile.write('#######################################\n')
    outfile.write('#BioType stats:\n')
    keys=DepthDict.keys()
    keys.sort()
    print keys
    for BT in keys:
        if BTLenDict.has_key(BT):
            pass
        else:
            continue
        covered=0
        coverageScores=DepthDict[BT].keys()
        coverageScores.sort()
        totalcounts=0
        for score in coverageScores:
            covered=covered+DepthDict[BT][score]
            totalcounts+=score*DepthDict[BT][score]
        outline=BT+'\t'+str(BTLenDict[BT])+'\t'+str(covered)+'\t'+str(totalcounts)
        outfile.write(outline +' \n')
    covered=0
    coverageScores=DepthDict['Unannotated'].keys()
    coverageScores.sort()
    totalcounts=0
    for score in coverageScores:
        covered=covered+DepthDict['Unannotated'][score]
        totalcounts+=score*DepthDict['Unannotated'][score]
    outline='Unannotated'+'\t'+'---'+'\t'+str(covered)+'\t'+str(totalcounts)
    outfile.write(outline +' \n')
    outfile.write('#######################################\n')
    outfile.write('#Coverage stats:\n')
    keys=DepthDict.keys()
    keys.sort()
    for BT in keys:
        outfile.write(BT+'\n')
        coverageScores=DepthDict[BT].keys()
        coverageScores.sort()
        for score in coverageScores:
            outline=str(score)+'\t'+str(DepthDict[BT][score])
            outfile.write(outline +' \n')
    outfile.close()

run()
