##################################
#                                #
# Last modified 03/04/2012       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
import math
from sets import Set

def run():

    if len(sys.argv) < 7:
        print 'usage: python %s  <gtf file> <chrom sizes> <data table with FPKMs> <transcriptID fieldID> <data fields> <FPKM thresholds> outfilename [-chr chr1,...,chrN]' % sys.argv[0]
        print '	data table: spreadsheet with abundance values for each elements in each dataset'
        print '	data fields format: any combination of comma separated values and from-to intervals (both ends included), i.e. for example: 3,4,5,8-11,15, etc.'
        print '	FPKM thresholds - comma-separated'
        sys.exit(1)

    GTF = sys.argv[1]
    chromsizes = sys.argv[2]
    datafilename = sys.argv[3]

    transcriptFieldID = int(sys.argv[4])

    DataFields=[]
    fields = sys.argv[5].split(',')
    for ID in fields:
        if '-' in ID:
            start = int(ID.split('-')[0])
            end = int(ID.split('-')[1])
            for i in range(start,end+1):
                DataFields.append(i)
        else:
            DataFields.append(int(ID))
    DataFields.sort()

    CutoffValues=[]
    fields = sys.argv[6].split(',')
    for ID in fields:
        CutoffValues.append(float(ID))
    CutoffValues.append(0)
    CutoffValues=list(Set(CutoffValues))
    CutoffValues.sort()

    doSubsetChr = False
    if '-chr' in sys.argv:
        doSubsetChr = True
        WantedChr={}
        chromosomes = sys.argv[sys.argv.index('-chr')+1].split(',')
        for chr in chromosomes:
            WantedChr[chr]=0


    outfilename = sys.argv[7]

    ChrDict={}
    lineslist  = open(chromsizes)
    i=0
    for line in lineslist:
        fields=line.strip().split('\t')
        chr = fields[0]
        if doSubsetChr:
            if WantedChr.has_key(chr):
                pass
            else:
                continue
        length = int(fields[1])
        ChrDict[chr] = length

    print ChrDict
 
    TranscriptsDict={}
    lineslist  = open(GTF)
    i=0
    for line in lineslist:
        if line.startswith('#'):
            continue
        i+=1
        if i % 100000 == 0:
            print i
        fields=line.strip().split('\t')
        chr = fields[0]
        if doSubsetChr:
            if WantedChr.has_key(chr):
                pass
            else:
                continue
        if TranscriptsDict.has_key(chr):
            pass
        else:
            TranscriptsDict[chr]={}
        if fields[2]!= 'exon':
            continue
        left = int(fields[3])
        right = int(fields[4])
        strand = fields[6]
        transcriptID = fields[8].split('transcript_id "')[1].split('";')[0]
        if TranscriptsDict[chr].has_key(transcriptID):
            pass
        else:
            TranscriptsDict[chr][transcriptID]=[]
        TranscriptsDict[chr][transcriptID].append((left,right,strand))

    print 'finshed parsing GTF file'

    ExpressionDict={}

    lineslist  = open(datafilename)
    i=0
    for line in lineslist:
        if line.startswith('#'):
            continue
        i+=1
        if i % 100000 == 0:
            print i
        fields=line.strip().split('\t')
        if fields[0] == 'tracking_id':
            continue
        transcriptID = fields[transcriptFieldID]
        ExpressionDict[transcriptID] = []
        for fieldID in DataFields:
            ExpressionDict[transcriptID].append(float(fields[fieldID].split(',')[0]))

    print 'finshed parsing expression values'

    outfile = open(outfilename, 'w')

    chrKeys = ChrDict.keys()
    chrKeys.sort()

    missing=0

    for chr in chrKeys:
        outline = chr + '\t' + 'Total_chr_Length'
        for FPKM in CutoffValues:
            outline = outline + '\t' + str(FPKM)
        outfile.write(outline + '\n')
        print outline
        outline1 = 'Exons' + '\t' + str(ChrDict[chr]) + '\t'
        outline2 = 'Exons+Introns' + '\t' + str(ChrDict[chr]) + '\t'
        for FPKM in CutoffValues:
            if TranscriptsDict.has_key(chr):
                pass
            else:
                outline1 = outline1 + '\t0'
                outline2 = outline2 + '\t0'
                continue
            CoveredBPListExonic={}
            CoveredBPListExonicIntronic={}
            for transcriptID in TranscriptsDict[chr].keys():
                if ExpressionDict.has_key(transcriptID):
                    pass
                else:
                    missing+=1
                    print 'trancript', transcriptID, 'not found in expression table;', missing, 'transcripts not found in total'
                    continue
                TranscriptsDict[chr][transcriptID].sort()
                if max(ExpressionDict[transcriptID]) >= FPKM:
                    for (left,right,strand) in TranscriptsDict[chr][transcriptID]:
#                        print left, right
                        for i in range(left,right):
                            CoveredBPListExonic[i]=0
                    for i in range(TranscriptsDict[chr][transcriptID][0][0],TranscriptsDict[chr][transcriptID][-1][1]):
                        CoveredBPListExonicIntronic[i]=0
            outline1 = outline1 + '\t' + str(len(CoveredBPListExonic.keys()))
            outline2 = outline2 + '\t' + str(len(CoveredBPListExonicIntronic.keys()))
        outfile.write(outline1 + '\n')
        outfile.write(outline2 + '\n')
        print outline1
        print outline2

    outfile.close()
        
run()

