##################################
#                                #
# Last modified 7/15/2009         # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
import math
from cistematic.core import Genome
from cistematic.core.geneinfo import geneinfoDB
from commoncode import *

def run():

    if len(sys.argv) < 8:
        print 'usage: python %s genome regionfilename startField PhastConsDirectory Consthreshold minlength totalbp outputfilename [-addrequirement fieldID] ' % sys.argv[0]

        sys.exit(1)
    
    genome=sys.argv[1]
    bedfile = sys.argv[2]
    PhastConsDirectory = sys.argv[4]
    outfilename = sys.argv[8]
    threshold=float(sys.argv[5])
    minlength=float(sys.argv[6])
    totalbp=float(sys.argv[7])
    fieldID=int(sys.argv[3])

    hg = Genome(genome)

    doAddRequire=False
    if '-addrequirement' in sys.argv:
        doAddRequire=True
        doAddRequireFieldID=int(sys.argv[sys.argv.index('-addrequirement') + 1])
        print 'Requiring field', doAddRequireFieldID, 'to be non-empty'

    outfile = open(outfilename, 'w')

    regionDict={}
    consRegionDict={}
    listoflines = open(bedfile)
    lineslist = listoflines.readlines()
    for line in lineslist:
        if line[0]=='#':
            continue
        fields=line.split('\n')[0].split('\t')

        if doAddRequire and fields[doAddRequireFieldID]=='':
            continue
        chr=fields[fieldID]
        start=int(fields[fieldID+1])
        stop=int(fields[fieldID+2])
        if regionDict.has_key(chr):
            regionDict[chr].append((start,stop))
        else:
            consRegionDict[chr]=[]
            regionDict[chr]=[]
            regionDict[chr].append((start,stop))
    
    for chr in regionDict.keys():
        print 'processing', chr
        phastConsfile=PhastConsDirectory+'/'+chr+'.data'
        listoflines = open(phastConsfile)
        lineslist = listoflines.readlines()
        phastConsDict={}
        print len(lineslist)
        dummyDict={}
        for (start,stop) in regionDict[chr]:
            for i in range(start,stop):
                dummyDict[i]=''
        for line in lineslist:
            if line[0]=='f':
                currentPos=int(line.split('start=')[1].split(' ')[0])
            else:
                if dummyDict.has_key(currentPos):
                    phastConsDict[currentPos]=float(line.strip())
                currentPos=currentPos+1
        print len(phastConsDict)
        for (start,stop) in regionDict[chr]:
            currentPos=''
            for i in range(start,stop):
                if (phastConsDict.has_key(i) and phastConsDict[i]<threshold):
                    if currentPos!='':
                        if i>=currentPos+minlength:
                            consRegionDict[chr].append((currentPos,i))
                    currentPos=''
                elif (phastConsDict.has_key(i) and phastConsDict[i]>=threshold):
                    if currentPos=='':
                        currentPos=i
                    else:
                        continue
                else:
                    if currentPos!='':
                        if i>=currentPos+minlength:
                            consRegionDict[chr].append((currentPos,i))
                    currentPos=''

    totalSequenceLength=0
    for chr in consRegionDict.keys():
        chromosome = chr[3:len(chr)]
        print len(consRegionDict[chr])
        print 'getting sequence for', chr
        for (start,stop) in consRegionDict[chr]:
            totalSequenceLength=totalSequenceLength+(stop-start)
            if totalSequenceLength>totalbp:
                print 'Maximum sequence length reached'
                break
            else:
                sequence=hg.sequence(chromosome,start,stop-start)
                outfile.write('>'+chr+':'+str(start)+'-'+str(stop)+'\n')
                outfile.write(sequence)
                outfile.write('\n')

    outfile.close()
   
run()
