##################################
#                                #
# Last modified 8/14/2009        # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import re
import string
import math
from cistematic.core import Genome
from cistematic.core.geneinfo import geneinfoDB
from commoncode import *

def run():

    print 'for some reason when displayed on the browser all E-boxes appear shifted by 5bp to the right of the actual Ebox sequence, so a 5bp shift to the left has been implemented in the code'
    if len(sys.argv) < 3:
        print 'usage: python %s label genomeDirectory outputfilename [-conserved PhastConsDirectory Consthreshold [CAGSTG | strict]] [-doNotRepeatMask] ' % sys.argv[0]

        sys.exit(1)
    
    label=sys.argv[1]
    genomeDir = sys.argv[2]
    outfilename = sys.argv[3]

    doConserved=False
    if '-conserved' in sys.argv:
        doConserved=True
        PhastConsDirectory = sys.argv[sys.argv.index('-conserved') + 1]
        threshold=float(sys.argv[sys.argv.index('-conserved') + 2])
        conservation=sys.argv[sys.argv.index('-conserved') + 3]
        print 'will report conserved e-boxes'
    doNotRepeatMask=False
    if '-doNotRepeatMask' in sys.argv:
        doNotRepeatMask=True
        print 'will not mask repeats'

    SenseColor="255,0,0"
    AntiSenseColor="0,0,255"
    Color="0,0,0"


    m = re.compile('CA(GG|GC|CC)TG')

    outfile = open(outfilename, 'w')

    outfile.write('track name="%s" visibility=4 itemRgb="On"\n' % (label))

    files = os.listdir(genomeDir)
    files.sort()
    i=0
    for file in files:
        if file[0:3]!='chr':
            print 'skipping', file
            continue
        print file
        chromosome=file.split('.')[0]
        chrseq=open(genomeDir+'/'+file)
        seq=chrseq.read()
        seq=seq.replace('\n','')
        if doNotRepeatMask:
            seq=seq.upper()
        if doConserved:
            consfiles = os.listdir(PhastConsDirectory)
            for phastConsfile in consfiles:
                if phastConsfile.split('.')[0]==chromosome:
                    print 'opened file', phastConsfile
                    path=PhastConsDirectory+'/'+phastConsfile
                    listoflines = open(path)
                    continue
            lineslist = listoflines.readlines()
            phastConsDict={}
            for line in lineslist:
                if line[0]=='f':
                    currentPos=int(line.split('start=')[1].split(' ')[0])
                else:
                    currentPos=currentPos+1
                    if float(line.strip())>threshold:
                       phastConsDict[currentPos]=float(line.strip())
        weight=1
        k=0
        if doConserved:
            for mo in m.finditer(seq):
                if conservation=='CAGSTG' and phastConsDict.has_key(mo.start()):
                    if phastConsDict.has_key(mo.start()+1):
                        if phastConsDict.has_key(mo.start()+2):
                            if phastConsDict.has_key(mo.start()+4):
                                if phastConsDict.has_key(mo.start()+5):
                                    i+=1
                                    k+=1
                                    readID='ConservedE-box'+str(i)
                                    if seq[mo.start():mo.end()]=='CAGCTG': 
                                        sense='+'
                                        color=Color
                                    if seq[mo.start():mo.end()]=='CAGGTG':
                                        sense='+'
                                        color=SenseColor
                                    if seq[mo.start():mo.end()]=='CACCTG':
                                        sense='-'
                                        color=AntiSenseColor
                                    outfile.write('%s %d %d %s %.1f %s 0 0 %s\n' % (chromosome, mo.start()-6, mo.end()-6, readID, weight, sense, color))
                if conservation=='strict' and phastConsDict.has_key(mo.start()):
                    if phastConsDict.has_key(mo.start()+1):
                        if phastConsDict.has_key(mo.start()+2):
                            if phastConsDict.has_key(mo.start()+4):
                                if phastConsDict.has_key(mo.start()+5):
                                    if phastConsDict.has_key(mo.start()):
                                        i+=1
                                        k+=1
                                        readID='ConservedE-box'+str(i)
                                        if seq[mo.start():mo.end()]=='CAGCTG': 
                                            sense='+'
                                            color=Color
                                        if seq[mo.start():mo.end()]=='CAGGTG':
                                            sense='+'
                                            color=SenseColor
                                        if seq[mo.start():mo.end()]=='CACCTG':
                                            sense='-'
                                            color=AntiSenseColor
                                        outfile.write('%s %d %d %s %.1f %s 0 0 %s\n' % (chromosome, mo.start()-6, mo.end()-6, readID, weight, sense, color))
        else:
            for mo in m.finditer(seq):
                i+=1
                k+=1
                readID='E-box'+str(i)
                if seq[mo.start():mo.end()]=='CAGCTG': 
                    sense='+'
                    color=Color
                if seq[mo.start():mo.end()]=='CAGGTG':
                    sense='+'
                    color=SenseColor
                if seq[mo.start():mo.end()]=='CACCTG':
                    sense='-'
                    color=AntiSenseColor
                outfile.write('%s %d %d %s %.1f %s 0 0 %s\n' % (chromosome, mo.start()-6, mo.end()-6, readID, weight, sense, color))
        print 'found', k, 'E-boxes on', chromosome
        



    outfile.close()
   
run()
