##################################
#                                #
# Last modified 03/02/2011       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
import math
from cistematic.core import Genome
from cistematic.core.geneinfo import geneinfoDB
from commoncode import *

def run():

    if len(sys.argv) < 4:
        print 'usage: python %s genome rdsfilename TSS_Window_Size outputfilename [-geneBody bp] [-geneBodyStart positiondonwstreamofTSS] [-control controlrdsfilename] [-GENCODE gencode.gtf] [-refFlat refFlat.txt] [-UCSC knownGene] [-cache size] [-RPM] [-subtractControl]' % sys.argv[0]
        print "       Note: if you use the -control option, you will get the ratio of control over ChIP over the entire region, if not the RPKMs will be reported"

        sys.exit(1)
    
    genome = sys.argv[1]
    hitfilename = sys.argv[2]
    window=int(sys.argv[3])
    outfilename = sys.argv[4]

    outfile = open(outfilename, 'w')

    GeneBodyStartPos=int(window/2.)
    if '-geneBodyStart' in sys.argv:
        GeneBodyStartPos=int(sys.argv[sys.argv.index('-geneBodyStart') + 1])

    doFixedDownStreamEnd=False
    if '-geneBody' in sys.argv:
        geneBody=int(sys.argv[sys.argv.index('-geneBody') + 1])
        doFixedDownStreamEnd=True

    doRPM=False
    if '-RPM' in sys.argv:
        doRPM=True

    doControlSubtract=False
    if '-subtractControl' in sys.argv:
        doControlSubtract=True

    cachePages = -1
    doCache = False
    if '-cache' in sys.argv:
        doCache = True
        cachePages =  int(sys.argv[sys.argv.index('-cache') + 1])
    hitRDS = readDataset(hitfilename, verbose = True, cache=True)


    doCISTEMATIC=True
    doGENCODE=False
    doUCSC=False
    doRefFlat=False

    if '-refFlat' in sys.argv:
        doRefFlat=True
        doGENCODE=False
        doCISTEMATIC=False
        doUCSC=False
        print 'will use annotation supplied in refFalt file'
        refFlat=sys.argv[sys.argv.index('-refFlat') + 1]


    if '-GENCODE' in sys.argv:
        doGENCODE=True
        doCISTEMATIC=False
        doUCSC=False
        doRefFlat=False
        print 'will use GENCODE annotation'
        GENCODEgenelist=sys.argv[sys.argv.index('-GENCODE') + 1]

    if '-UCSC' in sys.argv:
        doGENCODE=False
        doCISTEMATIC=False
        doRefFlat=False
        doUCSC=True
        print 'will use UCSC annotation'
        knownGene=sys.argv[sys.argv.index('-UCSC') + 1]

    doControl=False
    if '-control' in sys.argv:
        doControl=True
        controlrdsfilename = sys.argv[sys.argv.index('-control') + 1]
        hitCtrlRDS = readDataset(controlrdsfilename, verbose = True, cache=doCache)
        ctrlnormalizeBy = len(hitCtrlRDS) / 1000000.

    #sqlite default_cache_size is 2000 pages
    if cachePages > hitRDS.getDefaultCacheSize():
        hitRDS.setDBcache(cachePages)
    if doControl and cachePages > hitCtrlRDS.getDefaultCacheSize():
        hitCtrlRDS.setDBcache(cachePages)

    metadata = hitRDS.getMetadata()
    dataType = metadata['dataType']
    if doControl:
        metadata = hitCtrlRDS.getMetadata()
        dataType = metadata['dataType']
    normalizeBy = len(hitRDS) / 1000000.

    genes = {}

    i=0
    if doUCSC:
        if doRPM:
            if doControl:
                outfile.write('#GeneID\tGeneName\tChr\tStart\tEnd\tOrientation\tTSSRPM\tGeneBodyRPM\tTSSEnrichment\tGeneBodyEnrichment\tRPMratio\tEnrichmentRatio\n')
            else:
                outfile.write('#GeneID\tGeneName\tChr\tStart\tEnd\tOrientation\tTSSRPM\tGeneBodyRPM\tRPMratio\n')
        else:
            if doControl:
                outfile.write('#GeneID\tGeneName\tChr\tStart\tEnd\tOrientation\tTSSRPKM\tGeneBodyRPKM\tTSSEnrichment\tGeneBodyEnrichment\tRPKMratio\tEnrichmentRatio\n')
            else:
                outfile.write('#GeneID\tGeneName\tChr\tStart\tEnd\tOrientation\tTSSRPKM\tGeneBodyRPKM\tRPKMratio\n')
        print 'processing UCSC annotation'
        i=0
        inputdatafile = open(knownGene)
        lineslist = inputdatafile.readlines()
        for line in lineslist:
            i+=1
            if i % 1000 == 0:
                print i
            fields=line.strip().split('\t')
            chr=fields[1]
            name=fields[0]
            orientation=fields[2]
            leftPos=int(fields[3])
            rightPos=int(fields[4])
            if rightPos-leftPos < window+2*GeneBodyStartPos:
                print 'gene', name, leftPos, rightPos, 'too short, skipping' 
                continue
            if orientation == 'F' or orientation == '+':
                TSS=leftPos
                GeneEnd=rightPos
                TSSleft=TSS-int(window/2.)
                TSSright=TSS+int(window/2.)
                GeneStart=TSS+GeneBodyStartPos
                if doControl:
                    v1=1. + hitRDS.getCounts(chrom=chr, rmin=TSSleft, rmax=TSSright, uniqs=True, multi=True, splices=False, reportCombined=True)
                    v2=1. + hitCtrlRDS.getCounts(chrom=chr, rmin=TSSleft, rmax=TSSright, uniqs=True, multi=True, splices=False, reportCombined=True)
                    TSSRPKM=v1/(normalizeBy*(window/1000.0))
                    if doRPM:
                        if doControlSubtract:
                            TSSRPKM=max(0.01,(v1/normalizeBy - v2/ctrlnormalizeBy))
                        else:
                            TSSRPKM=v1/normalizeBy
                    TSSEnrichment=(v1/normalizeBy)/(v2/ctrlnormalizeBy)
                    v1=1. + hitRDS.getCounts(chrom=chr, rmin=GeneStart, rmax=GeneEnd, uniqs=True, multi=True, splices=False, reportCombined=True)
                    v2=1. + hitCtrlRDS.getCounts(chrom=chr, rmin=GeneStart, rmax=GeneEnd, uniqs=True, multi=True, splices=False, reportCombined=True)
                    GeneBodyRPKM=v1/(normalizeBy*((GeneEnd-GeneStart)/1000.0))
                    if doRPM:
                        if doControlSubtract:
                            GeneBodyRPKM=max(0.01,(v1/normalizeBy - v2/ctrlnormalizeBy))
                        else:
                            GeneBodyRPKM=v1/normalizeBy
                    GeneBodyEnrichment=(v1/normalizeBy)/(v2/ctrlnormalizeBy)
                    outline = '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t' % (name, name, chr, TSS, GeneEnd, orientation, TSSRPKM, GeneBodyRPKM, TSSEnrichment, GeneBodyEnrichment, TSSRPKM/GeneBodyRPKM, TSSEnrichment/GeneBodyEnrichment)
                else:
                    v1=1. + hitRDS.getCounts(chrom=chr, rmin=TSSleft, rmax=TSSright, uniqs=True, multi=True, splices=False, reportCombined=True)
                    TSSRPKM=v1/(normalizeBy*(window/1000.0))
                    if doRPM:
                        TSSRPKM=v1/normalizeBy
                    v1=1. + hitRDS.getCounts(chrom=chr, rmin=GeneStart, rmax=GeneEnd, uniqs=True, multi=True, splices=False, reportCombined=True)
                    GeneBodyRPKM=v1/(normalizeBy*((GeneEnd-GeneStart)/1000.0))
                    if doRPM:
                        GeneBodyRPKM=v1/normalizeBy
                    outline = '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t' % (name, name, chr, TSS, GeneEnd, orientation, TSSRPKM, GeneBodyRPKM)
            if orientation == 'R' or orientation == '-':
                GeneEnd=leftPos
                TSS=rightPos
                TSSleft=TSS-int(window/2.)
                TSSright=TSS+int(window/2.)
                GeneStart=TSS-GeneBodyStartPos
                if doControl:
                    v1=1. + hitRDS.getCounts(chrom=chr, rmin=TSSleft, rmax=TSSright, uniqs=True, multi=True, splices=False, reportCombined=True)
                    TSSRPKM=v1/(normalizeBy*(window/1000.0))
                    if doRPM:
                        if doControlSubtract:
                            TSSRPKM=max(0.01,(v1/normalizeBy - v2/ctrlnormalizeBy))
                        else:
                            TSSRPKM=v1/normalizeBy
                    TSSEnrichment=(v1/normalizeBy)/(v2/ctrlnormalizeBy)
                    v1=1. + hitRDS.getCounts(chrom=chr, rmin=GeneEnd, rmax=GeneStart, uniqs=True, multi=True, splices=False, reportCombined=True)
                    GeneBodyRPKM=v1/(normalizeBy*((GeneStart-GeneEnd)/1000.0))
                    if doRPM:
                        if doControlSubtract:
                            GeneBodyRPKM=max(0.01,(v1/normalizeBy - v2/ctrlnormalizeBy))
                        else:
                            GeneBodyRPKM=v1/normalizeBy
                    GeneBodyEnrichment=(v1/normalizeBy)/(v2/ctrlnormalizeBy)
                    outline = '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t' % (name, name, chr, TSS, GeneEnd, orientation, TSSRPKM, GeneBodyRPKM, TSSEnrichment, GeneBodyEnrichment, TSSRPKM/GeneBodyRPKM, TSSEnrichment/GeneBodyEnrichment)
                else:
                    v1=1. + hitRDS.getCounts(chrom=chr, rmin=TSSleft, rmax=TSSright, uniqs=True, multi=True, splices=False, reportCombined=True)
                    TSSRPKM=v1/(normalizeBy*(window/1000.0))
                    if doRPM:
                        TSSRPKM=v1/normalizeBy
                    v1=1. + hitRDS.getCounts(chrom=chr, rmin=GeneEnd, rmax=GeneStart, uniqs=True, multi=True, splices=False, reportCombined=True)
                    GeneBodyRPKM=v1/(normalizeBy*((GeneStart-GeneEnd)/1000.0))
                    if doRPM:
                        GeneBodyRPKM=v1/normalizeBy
                    outline = '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t' % (name, name, chr, GeneEnd, TSS, orientation, TSSRPKM, GeneBodyRPKM)
            outfile.write(outline + '\n')
    if doGENCODE:
        if doControl:
            outfile.write('#GeneName\tTranscriptName\tChr\tStart\tEnd\tOrientation\tTSSRPKM\tGeneBodyRPKM\tStallingRatio\tTSSEnrichment\tGeneBodyEnrichment\tEnrichmentRatio\n')
        else:
            outfile.write('#GeneName\tTranscriptName\tChr\tStart\tEnd\tOrientation\tTSSRPKM\tGeneBodyRPKM\tStallingRatio\n')
        print 'processing GENCODE annotation'
        i=0
        lineslist = open(GENCODEgenelist)
        for line in lineslist:
            i+=1
            if i % 100000 == 0:
                print i
            if line.startswith('#'):
                continue
            fields=line.strip().split('\t')
            if 'rRNA' in fields[8] or '7SK' in fields[8]:
                continue
            if fields[2]!='transcript':
                continue
            chr=fields[0]
            GeneName=fields[8].split('gene_name "')[1].split('";')[0]
            TranscriptName=fields[8].split('transcript_name "')[1].split('";')[0]
            orientation=fields[6]
            leftPos=int(fields[3])
            rightPos=int(fields[4])
            if rightPos-leftPos < window+2*GeneBodyStartPos:
                continue
            if orientation == 'F' or orientation == '+':
                TSS=leftPos
                GeneEnd=rightPos
                TSSleft=TSS-int(window/2.)
                TSSright=TSS+int(window/2.)
                GeneStart=TSS+GeneBodyStartPos
                if doFixedDownStreamEnd:
                    GeneEnd=GeneStart+geneBody
                if doControl:
                    v1=1. + hitRDS.getCounts(chrom=chr, rmin=TSSleft, rmax=TSSright, uniqs=True, multi=True, splices=False, reportCombined=True)
                    v2=1. + hitCtrlRDS.getCounts(chrom=chr, rmin=TSSleft, rmax=TSSright, uniqs=True, multi=True, splices=False, reportCombined=True)
                    TSSRPKM=v1/(normalizeBy*(window/1000.0))
                    if doRPM:
                        if doControlSubtract:
                            TSSRPKM=max(0.01,(v1/normalizeBy - v2/ctrlnormalizeBy))
                        else:
                            TSSRPKM=v1/normalizeBy
                    TSSEnrichment=(v1/normalizeBy)/(v2/ctrlnormalizeBy)
                    v1=1. + hitRDS.getCounts(chrom=chr, rmin=GeneStart, rmax=GeneEnd, uniqs=True, multi=True, splices=False, reportCombined=True)
                    v2=1. + hitCtrlRDS.getCounts(chrom=chr, rmin=GeneStart, rmax=GeneEnd, uniqs=True, multi=True, splices=False, reportCombined=True)
                    GeneBodyRPKM=v1/(normalizeBy*((GeneEnd-GeneStart)/1000.0))
                    if doRPM:
                        if doControlSubtract:
                            GeneBodyRPKM=max(0.01,(v1/normalizeBy - v2/ctrlnormalizeBy))
                        else:
                            GeneBodyRPKM=v1/normalizeBy
                    GeneBodyEnrichment=(v1/normalizeBy)/(v2/ctrlnormalizeBy)
                    outline = '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t' % (GeneName, TranscriptName, chr, TSS, GeneEnd, orientation, TSSRPKM, GeneBodyRPKM, TSSRPKM/GeneBodyRPKM, TSSEnrichment, GeneBodyEnrichment, TSSEnrichment/GeneBodyEnrichment)
                else:
                    v1=1. + hitRDS.getCounts(chrom=chr, rmin=TSSleft, rmax=TSSright, uniqs=True, multi=True, splices=False, reportCombined=True)
                    TSSRPKM=v1/(normalizeBy*(window/1000.0))
                    if doRPM:
                        TSSRPKM=v1/normalizeBy
                    v1=1. + hitRDS.getCounts(chrom=chr, rmin=GeneStart, rmax=GeneEnd, uniqs=True, multi=True, splices=False, reportCombined=True)
                    GeneBodyRPKM=v1/(normalizeBy*((GeneEnd-GeneStart)/1000.0))
                    if doRPM:
                        GeneBodyRPKM=v1/normalizeBy
                    outline = '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t\t%s' % (GeneName, TranscriptName, chr, TSS, GeneEnd, orientation, TSSRPKM, GeneBodyRPKM, (TSSRPKM/GeneBodyRPKM))
            if orientation == 'R' or orientation == '-':
                GeneEnd=leftPos
                TSS=rightPos
                TSSleft=TSS-int(window/2.)
                TSSright=TSS+int(window/2.)
                GeneStart=TSS-GeneBodyStartPos
                if doFixedDownStreamEnd:
                    GeneEnd=GeneStart-geneBody
                if doControl:
                    v1=1. + hitRDS.getCounts(chrom=chr, rmin=TSSleft, rmax=TSSright, uniqs=True, multi=True, splices=False, reportCombined=True)
                    v2=1. + hitCtrlRDS.getCounts(chrom=chr, rmin=TSSleft, rmax=TSSright, uniqs=True, multi=True, splices=False, reportCombined=True)
                    TSSRPKM=v1/(normalizeBy*(window/1000.0))
                    if doRPM:
                        if doControlSubtract:
                            TSSRPKM=max(0.01,(v1/normalizeBy - v2/ctrlnormalizeBy))
                        else:
                            TSSRPKM=v1/normalizeBy
                    TSSEnrichment=(v1/normalizeBy)/(v2/ctrlnormalizeBy)
                    v1=1. + hitRDS.getCounts(chrom=chr, rmin=GeneEnd, rmax=GeneStart, uniqs=True, multi=True, splices=False, reportCombined=True)
                    v2=1. + hitCtrlRDS.getCounts(chrom=chr, rmin=GeneEnd, rmax=GeneStart, uniqs=True, multi=True, splices=False, reportCombined=True)
                    GeneBodyRPKM=v1/(normalizeBy*((GeneStart-GeneEnd)/1000.0))
                    if doRPM:
                        if doControlSubtract:
                            GeneBodyRPKM=max(0.01,(v1/normalizeBy - v2/ctrlnormalizeBy))
                        else:
                            GeneBodyRPKM=v1/normalizeBy
                    GeneBodyEnrichment=(v1/normalizeBy)/(v2/ctrlnormalizeBy)
                    outline = '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t' % (GeneName, TranscriptName, chr, GeneEnd, TSS, orientation, TSSRPKM, GeneBodyRPKM, TSSRPKM/GeneBodyRPKM, TSSEnrichment, GeneBodyEnrichment, TSSEnrichment/GeneBodyEnrichment)
                else:
                    v1=1. + hitRDS.getCounts(chrom=chr, rmin=TSSleft, rmax=TSSright, uniqs=True, multi=True, splices=False, reportCombined=True)
                    TSSRPKM=v1/(normalizeBy*(window/1000.0))
                    if doRPM:
                        TSSRPKM=v1/normalizeBy
                    v1=1. + hitRDS.getCounts(chrom=chr, rmin=GeneEnd, rmax=GeneStart, uniqs=True, multi=True, splices=False, reportCombined=True)
                    GeneBodyRPKM=v1/(normalizeBy*((GeneStart-GeneEnd)/1000.0))
                    if doRPM:
                        GeneBodyRPKM=v1/normalizeBy
                    outline = '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t\t%s' % (GeneName, TranscriptName, chr, GeneEnd, TSS, orientation, TSSRPKM, GeneBodyRPKM, TSSRPKM/GeneBodyRPKM)
            outfile.write(outline + '\n')
    if doCISTEMATIC:
        print 'processing CISTEMATIC annotation'
        hg = Genome(genome)
        idb = geneinfoDB()
        geneinfoDict = idb.getallGeneInfo(genome)
        featDict = hg.getallGeneFeatures()
        geneIDs = featDict.keys()
        for k in featDict.keys():
            if i % 1000 == 0:
                print len(featDict.keys())-i 
            i+=1
            start=0
            stop=0
            if idb.getGeneInfo((genome,k))==[]:
                name = 'LOC'+str(k)
            else:
                name = idb.getGeneInfo((genome,k))[0]
            genes[name]={}
            leftPos=[]
            rightPos=[]
            for feature in featDict[k]:
                leftPos.append(int(feature[2]))
                rightPos.append(int(feature[3]))
            chr= 'chr'+str(featDict[k][0][1])
            orientation=str(featDict[k][0][4])
            if orientation == 'F' or orientation == '+':
                TSS=min(leftPos)
                GeneEnd=max(rightPos)
                if GeneEnd-TSS < window+2*GeneBodyStartPos:
                    print 'gene', name, 'too short, skipping'
                    continue
                TSSleft=TSS-int(window/2.)
                TSSright=TSS+int(window/2.)
                GeneStart=TSS+GeneBodyStartPos
                if doControl:
                    v1=1. + hitRDS.getCounts(chrom=chr, rmin=TSSleft, rmax=TSSright, uniqs=True, multi=True, splices=False, reportCombined=True)
                    v2=1. + hitCtrlRDS.getCounts(chrom=chr, rmin=TSSleft, rmax=TSSright, uniqs=True, multi=True, splices=False, reportCombined=True)
                    TSSRPKM=v1/(normalizeBy*(window/1000.0))
                    if doRPM:
                        if doControlSubtract:
                            TSSRPKM=max(0.01,(v1/normalizeBy - v2/ctrlnormalizeBy))
                        else:
                            TSSRPKM=v1/normalizeBy
                    TSSEnrichment=(v1/normalizeBy)/(v2/ctrlnormalizeBy)
                    v1=1. + hitRDS.getCounts(chrom=chr, rmin=GeneStart, rmax=GeneEnd, uniqs=True, multi=True, splices=False, reportCombined=True)
                    v2=1. + hitCtrlRDS.getCounts(chrom=chr, rmin=GeneStart, rmax=GeneEnd, uniqs=True, multi=True, splices=False, reportCombined=True)
                    GeneBodyRPKM=v1/(normalizeBy*((GeneEnd-GeneStart)/1000.0))
                    if doRPM:
                        if doControlSubtract:
                            GeneBodyRPKM=max(0.01,(v1/normalizeBy - v2/ctrlnormalizeBy))
                        else:
                            GeneBodyRPKM=v1/normalizeBy
                    GeneBodyEnrichment=(v1/normalizeBy)/(v2/ctrlnormalizeBy)
                    outline = '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t' % (k, name, chr, TSS, GeneEnd, orientation, TSSRPKM, GeneBodyRPKM, TSSEnrichment, GeneBodyEnrichment)
                else:
                    v1=1. + hitRDS.getCounts(chrom=chr, rmin=TSSleft, rmax=TSSright, uniqs=True, multi=True, splices=False, reportCombined=True)
                    TSSRPKM=v1/(normalizeBy*(window/1000.0))
                    if doRPM:
                        TSSRPKM=v1/normalizeBy
                    v1=1. + hitRDS.getCounts(chrom=chr, rmin=GeneStart, rmax=GeneEnd, uniqs=True, multi=True, splices=False, reportCombined=True)
                    GeneBodyRPKM=v1/(normalizeBy*((GeneEnd-GeneStart)/1000.0))
                    if doRPM:
                        GeneBodyRPKM=v1/normalizeBy
                    outline = '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t' % (k, name, chr, TSS, GeneEnd, orientation, TSSRPKM, GeneBodyRPKM)
            if orientation == 'R' or orientation == '-':
                GeneEnd=min(leftPos)
                TSS=max(rightPos)
                TSSleft=TSS-int(window/2.)
                TSSright=TSS+int(window/2.)
                if TSS-GeneEnd < window+2*GeneBodyStartPos:
                    print 'gene', name, 'too short, skipping'
                    continue
                GeneStart=TSS-GeneBodyStartPos
                if doControl:
                    v1=1. + hitRDS.getCounts(chrom=chr, rmin=TSSleft, rmax=TSSright, uniqs=True, multi=True, splices=False, reportCombined=True)
                    v2=1. + hitCtrlRDS.getCounts(chrom=chr, rmin=TSSleft, rmax=TSSright, uniqs=True, multi=True, splices=False, reportCombined=True)
                    TSSRPKM=v1/(normalizeBy*(window/1000.0))
                    if doRPM:
                        if doControlSubtract:
                            TSSRPKM=max(0.01,(v1/normalizeBy - v2/ctrlnormalizeBy))
                        else:
                            TSSRPKM=v1/normalizeBy
                    TSSEnrichment=(v1/normalizeBy)/(v2/ctrlnormalizeBy)
                    v1=1. + hitRDS.getCounts(chrom=chr, rmin=GeneEnd, rmax=GeneStart, uniqs=True, multi=True, splices=False, reportCombined=True)
                    v2=1. + hitCtrlRDS.getCounts(chrom=chr, rmin=GeneEnd, rmax=GeneStart, uniqs=True, multi=True, splices=False, reportCombined=True)
                    GeneBodyRPKM=v1/(normalizeBy*((GeneStart-GeneEnd)/1000.0))
                    if doRPM:
                        if doControlSubtract:
                            GeneBodyRPKM=max(0.01,(v1/normalizeBy - v2/ctrlnormalizeBy))
                        else:
                            GeneBodyRPKM=v1/normalizeBy
                    GeneBodyEnrichment=(v1/normalizeBy)/(v2/ctrlnormalizeBy)
                    outline = '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t' % (k, name, chr, GeneEnd, TSS, orientation, TSSRPKM, GeneBodyRPKM, TSSEnrichment, GeneBodyEnrichment)
                else:
                    v1=1. + hitRDS.getCounts(chrom=chr, rmin=TSSleft, rmax=TSSright, uniqs=True, multi=True, splices=False, reportCombined=True)
                    TSSRPKM=v1/(normalizeBy*(window/1000.0))
                    if doRPM:
                        TSSRPKM=v1/normalizeBy
                    v1=1. + hitRDS.getCounts(chrom=chr, rmin=GeneEnd, rmax=GeneStart, uniqs=True, multi=True, splices=False, reportCombined=True)
                    GeneBodyRPKM=v1/(normalizeBy*((GeneStart-GeneEnd)/1000.0))
                    if doRPM:
                        GeneBodyRPKM=v1/normalizeBy
                    outline = '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t' % (k, name, chr, GeneEnd, TSS, orientation, TSSRPKM, GeneBodyRPKM)
            outfile.write(outline + '\n')
    if doRefFlat:
        if doControl:
            outfile.write('#Name\tID\tChr\tStart\tEnd\tOrientation\tTSSRPKM\tGeneBodyRPKM\tStallingRatio\tTSSEnrichment\tGeneBodyEnrichment\tEnrichmentRatio\n')
        else:
            outfile.write('#Name\tID\tChr\tStart\tEnd\tOrientation\tTSSRPKM\tGeneBodyRPKM\tStallingRatio\n')
        print 'processing refFlat file'
        i=0
        lineslist = open(refFlat)
        for line in lineslist:
            i+=1
            if i % 10000 == 0:
                print i
            if line.startswith('#'):
                continue
            fields=line.strip().split('\t')
            chr=fields[2]
            Name=fields[0]
            ID=fields[1]
            orientation=fields[3]
            leftPos=int(fields[4])
            rightPos=int(fields[5])
            if rightPos-leftPos < window+2*GeneBodyStartPos:
                continue
            if orientation == 'F' or orientation == '+':
                TSS=leftPos
                GeneEnd=rightPos
                TSSleft=TSS-int(window/2.)
                TSSright=TSS+int(window/2.)
                GeneStart=TSS+GeneBodyStartPos
                if doFixedDownStreamEnd:
                    GeneEnd=GeneStart+geneBody
                if doControl:
                    v1=1. + hitRDS.getCounts(chrom=chr, rmin=TSSleft, rmax=TSSright, uniqs=True, multi=True, splices=False, reportCombined=True)
                    v2=1. + hitCtrlRDS.getCounts(chrom=chr, rmin=TSSleft, rmax=TSSright, uniqs=True, multi=True, splices=False, reportCombined=True)
                    TSSRPKM=v1/(normalizeBy*(window/1000.0))
                    if doRPM:
                        if doControlSubtract:
                            TSSRPKM=max(0.01,(v1/normalizeBy - v2/ctrlnormalizeBy))
                        else:
                            TSSRPKM=v1/normalizeBy
                    TSSEnrichment=(v1/normalizeBy)/(v2/ctrlnormalizeBy)
                    v1=1. + hitRDS.getCounts(chrom=chr, rmin=GeneStart, rmax=GeneEnd, uniqs=True, multi=True, splices=False, reportCombined=True)
                    v2=1. + hitCtrlRDS.getCounts(chrom=chr, rmin=GeneStart, rmax=GeneEnd, uniqs=True, multi=True, splices=False, reportCombined=True)
                    GeneBodyRPKM=v1/(normalizeBy*((GeneEnd-GeneStart)/1000.0))
                    if doRPM:
                        if doControlSubtract:
                            GeneBodyRPKM=max(0.01,(v1/normalizeBy - v2/ctrlnormalizeBy))
                        else:
                            GeneBodyRPKM=v1/normalizeBy
                    GeneBodyEnrichment=(v1/normalizeBy)/(v2/ctrlnormalizeBy)
                    outline = '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t' % (Name, ID, chr, TSS, GeneEnd, orientation, TSSRPKM, GeneBodyRPKM, TSSRPKM/GeneBodyRPKM, TSSEnrichment, GeneBodyEnrichment, TSSEnrichment/GeneBodyEnrichment)
                else:
                    v1=1. + hitRDS.getCounts(chrom=chr, rmin=TSSleft, rmax=TSSright, uniqs=True, multi=True, splices=False, reportCombined=True)
                    TSSRPKM=v1/(normalizeBy*(window/1000.0))
                    if doRPM:
                        TSSRPKM=v1/normalizeBy
                    v1=1. + hitRDS.getCounts(chrom=chr, rmin=GeneStart, rmax=GeneEnd, uniqs=True, multi=True, splices=False, reportCombined=True)
                    GeneBodyRPKM=v1/(normalizeBy*((GeneEnd-GeneStart)/1000.0))
                    if doRPM:
                        GeneBodyRPKM=v1/normalizeBy
                    outline = '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t\t%s' % (Name, ID, chr, TSS, GeneEnd, orientation, TSSRPKM, GeneBodyRPKM, (TSSRPKM/GeneBodyRPKM))
            if orientation == 'R' or orientation == '-':
                GeneEnd=leftPos
                TSS=rightPos
                TSSleft=TSS-int(window/2.)
                TSSright=TSS+int(window/2.)
                GeneStart=TSS-GeneBodyStartPos
                if doFixedDownStreamEnd:
                    GeneEnd=GeneStart-geneBody
                if doControl:
                    v1=1. + hitRDS.getCounts(chrom=chr, rmin=TSSleft, rmax=TSSright, uniqs=True, multi=True, splices=False, reportCombined=True)
                    v2=1. + hitCtrlRDS.getCounts(chrom=chr, rmin=TSSleft, rmax=TSSright, uniqs=True, multi=True, splices=False, reportCombined=True)
                    TSSRPKM=v1/(normalizeBy*(window/1000.0))
                    if doRPM:
                        if doControlSubtract:
                            TSSRPKM=max(0.01,(v1/normalizeBy - v2/ctrlnormalizeBy))
                        else:
                            TSSRPKM=v1/normalizeBy
                    TSSEnrichment=(v1/normalizeBy)/(v2/ctrlnormalizeBy)
                    v1=1. + hitRDS.getCounts(chrom=chr, rmin=GeneEnd, rmax=GeneStart, uniqs=True, multi=True, splices=False, reportCombined=True)
                    v2=1. + hitCtrlRDS.getCounts(chrom=chr, rmin=GeneEnd, rmax=GeneStart, uniqs=True, multi=True, splices=False, reportCombined=True)
                    GeneBodyRPKM=v1/(normalizeBy*((GeneStart-GeneEnd)/1000.0))
                    if doRPM:
                        if doControlSubtract:
                            GeneBodyRPKM=max(0.01,(v1/normalizeBy - v2/ctrlnormalizeBy))
                        else:
                            GeneBodyRPKM=v1/normalizeBy
                    GeneBodyEnrichment=(v1/normalizeBy)/(v2/ctrlnormalizeBy)
                    outline = '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t' % (Name, ID, chr, GeneEnd, TSS, orientation, TSSRPKM, GeneBodyRPKM, TSSRPKM/GeneBodyRPKM, TSSEnrichment, GeneBodyEnrichment, TSSEnrichment/GeneBodyEnrichment)
                else:
                    v1=1. + hitRDS.getCounts(chrom=chr, rmin=TSSleft, rmax=TSSright, uniqs=True, multi=True, splices=False, reportCombined=True)
                    TSSRPKM=v1/(normalizeBy*(window/1000.0))
                    if doRPM:
                        TSSRPKM=v1/normalizeBy
                    v1=1. + hitRDS.getCounts(chrom=chr, rmin=GeneEnd, rmax=GeneStart, uniqs=True, multi=True, splices=False, reportCombined=True)
                    GeneBodyRPKM=v1/(normalizeBy*((GeneStart-GeneEnd)/1000.0))
                    if doRPM:
                        GeneBodyRPKM=v1/normalizeBy
                    outline = '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t\t%s' % (Name, ID, chr, GeneEnd, TSS, orientation, TSSRPKM, GeneBodyRPKM, TSSRPKM/GeneBodyRPKM)
            outfile.write(outline + '\n')

    outfile.close()
   
run()
