##################################
#                                #
# Last modified 08/24/2010       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
import math
from cistematic.core import Genome
from cistematic.core.geneinfo import geneinfoDB
from commoncode import *

def run():

    if len(sys.argv) < 6:
        print 'usage: python %s genome regionfilename startField PhastConsDirectory Consthreshold outputfilename [-reportEboxes] [-doNotRepeatMask] ' % sys.argv[0]

        sys.exit(1)
    
    genome=sys.argv[1]
    bedfile = sys.argv[2]
    fieldID=int(sys.argv[3])
    PhastConsDirectory = sys.argv[4]
    threshold=float(sys.argv[5])
    outfilename = sys.argv[6]

    hg = Genome(genome)

    doReportEboxes=False
    if '-reportEboxes' in sys.argv:
        doReportEboxes=True
        print 'will report conserved e-boxes'
    doNotRepeatMask=False
    if '-doNotRepeatMask' in sys.argv:
        doNotRepeatMask=True
        print 'will not mask repeats'

    outfile = open(outfilename, 'w')

    regionDict={}
    lineslist = open(bedfile)
    for line in lineslist:
        if line[0]=='#':
            continue
        fields=line.split('\n')[0].split('\t')
        chr=fields[fieldID]
        if 'random' in chr:
            continue
        start=int(fields[fieldID+1])
        stop=int(fields[fieldID+2])
        if regionDict.has_key(chr):
            regionDict[chr][(start,stop)]={}
            regionDict[chr][(start,stop)]
            regionDict[chr][(start,stop)]['line']=line.strip()
        else:
            regionDict[chr]={}
            regionDict[chr][(start,stop)]={}
            regionDict[chr][(start,stop)]
            regionDict[chr][(start,stop)]['line']=line.strip()

    outline='#Conservation threshold='+str(threshold)
    outfile.write(outline+'\n')
    outline='#AverageConservationScore and FractionAboveConsThreshold - last two fields\t'
    if doReportEboxes:
        outline=outline+'E-boxes and ConservedE-boxes - fields after conservation'
    outfile.write(outline+'\n')

    if doReportEboxes:
        Ebox=['CAGCTG','CAGGTG']
        for chr in regionDict.keys():
            chromosome = chr[3:len(chr)]
            for (start,stop) in regionDict[chr].keys():
                regionDict[chr][(start,stop)]['Eboxcoordinates']=[]
                sequence=hg.sequence(chromosome,start,regionDict[chr][(start,stop)]['coordinates'][1]-start)
                if doNotRepeatMask:
                    sequence=sequence.upper()
                partition1=sequence.split('CAGCTG')
                partition2=sequence.split('CAGGTG')
                if len(partition1)>1:
                    currentPos=start-6
                    for i in range(1,len(partition1)):
                        pos=len(partition1[i-1])+6
                        currentPos+=pos
                        regionDict[chr][(start,stop)]['Eboxcoordinates'].append(currentPos)
                if len(partition2)>1:
                    currentPos=start-6
                    for i in range(1,len(partition2)):
                        pos=len(partition2[i-1])+6
                        currentPos+=pos
                        regionDict[chr][(start,stop)]['Eboxcoordinates'].append(currentPos)
                print 'found', len(regionDict[chr][(start,stop)]['Eboxcoordinates']), 'E-boxes in', chr, start, regionDict[chr][(start,stop)]['coordinates'][1]

    files = os.listdir(PhastConsDirectory)
    for chr in regionDict.keys():
        print 'processing', chr
        for phastConsfile in files:
            if phastConsfile.split('.')[0]==chr:
                print 'opened file', phastConsfile
                path=PhastConsDirectory+'/'+phastConsfile
                lineslist = open(path)
                continue
        phastConsDict={}
        dummyDict={}
        for (start,stop) in regionDict[chr]:
            for i in range(start,stop):
                dummyDict[i]=''
        for line in lineslist:
            if line[0]=='f':
                currentPos=int(line.split('start=')[1].split(' ')[0])
            else:
                if dummyDict.has_key(currentPos):
                    phastConsDict[currentPos]=float(line.strip())
                currentPos=currentPos+1
        print len(phastConsDict)
        dummyDict={}
        for (start,stop) in regionDict[chr]:
            outline=regionDict[chr][(start,stop)]['line']
            ConsScore=0.0
            FractionConserved=0.0
            for i in range(start,stop):
                if phastConsDict.has_key(i):
                    ConsScore+=phastConsDict[i]
                    if phastConsDict[i]>=threshold:
                        FractionConserved+=1
            try:
                outline=outline+'\t'+str(ConsScore/(stop-start))+'\t'+str(FractionConserved/(stop-start))+'\t'
            except:
                print 'exception', chr, stop,start
                continue
            if doReportEboxes:
                outline=outline+str(len(regionDict[chr][(start,stop)]['Eboxcoordinates']))+'\t'
                consEboxes=0
                for pos in regionDict[chr][(start,stop)]['Eboxcoordinates']:
                    scores=[]
                    for i in range(pos,pos+6):
                        if phastConsDict.has_key(i):
                            scores.append(phastConsDict[i])
                    if len(scores)==6 and min(scores[0],scores[1],scores[2],scores[4],scores[5],)>=threshold:
                         consEboxes+=1
                outline=outline+str(consEboxes)
            outfile.write(outline+'\n')

    outfile.close()
   
run()
