###########################################################################
#                                                                         #
# C O P Y R I G H T   N O T I C E                                         #
#  Copyright (c) 2003-10 by:                                              #
#    * California Institute of Technology                                 #
#                                                                         #
#    All Rights Reserved.                                                 #
#                                                                         #
# Permission is hereby granted, free of charge, to any person             #
# obtaining a copy of this software and associated documentation files    #
# (the "Software"), to deal in the Software without restriction,          #
# including without limitation the rights to use, copy, modify, merge,    #
# publish, distribute, sublicense, and/or sell copies of the Software,    #
# and to permit persons to whom the Software is furnished to do so,       #
# subject to the following conditions:                                    #
#                                                                         #
# The above copyright notice and this permission notice shall be          #
# included in all copies or substantial portions of the Software.         #
#                                                                         #
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,         #
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF      #
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                   #
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS     #
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN      #
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN       #
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE        #
# SOFTWARE.                                                               #
###########################################################################
#
# This class contains the core code for using orthology and sequence-level conservation.
try:
    from pysqlite2 import dbapi2 as sqlite
except:
    from sqlite3 import dbapi2 as sqlite

import string, cistematic.core
from cistematic.core.homology import homologyDB, homologeneGenomes
from cistematic.programs.mafft import Mafft
from cistematic.programs.paircomp import Paircomp
from cistematic.programs.fastcomp import Fastcomp


class Conservation:
    """ The conservation class holds the conservation specific code for
        specifying orthology/paralogy, calling conservation identification code, 
        and manipulating/storing conserved sequences. It is meant to be used 
        in conjuction with the Experiment and AnalyzeMotifs classes.
    """
    conservationID = "default"
    consDBName = ""
    startingGenome = ""
    targetGenomes = []
    refGenes = []


    def setTargetGenomes(self, tGenomes):
        """ restricts homologs to the genomes specified in the tGenomes list.
        """
        self.targetGenomes = tGenomes


    def setRefGenes(self, rGenes):
        """ sets the list of loci (not geneIDs) that will be compared to homologs.
        """
        self.refGenes = rGenes


    def setStartingGenome(self, sGenome):
        """ sets the genome associated with the refGenes.
        """
        self.startingGenome = sGenome


    def importHomology(self, inputFile, source="generic"):
        """ loads homology relationships from a tab-delimited file of the form:
            homologyGroup orthologyGroup genome locus
            where orthologyGroup can be left blank.
        """
        stmtList = []
        importFile = open(inputFile, "r")
        stmt = "INSERT into homology(ID, conservationID, homologyGroup, orthologyGroup, genome, locus, source) values (NULL, ?, ?, ?, ?, ?, ?) "
        for line in importFile:
            (homGroup, orthGroup, genome, locus) = line.split("\t")
            stmtList.append((self.conservationID, homGroup, orthGroup, genome, locus, source))     

        importFile.close()
        self.batchsqlcons(stmt, stmtList)


    def insertHomologs(self, geneIDList, homGroup="", orthGroup="", source="generic"):
        """ define the geneIDs in geneIDList as being homologous.
        """
        stmtList = []

        if homGroup == "":
            for tempID in geneIDList:
                tempHomGroup = "%s-%s" % (str(tempID[0]), str(tempID[1]))
                if not self.hasHomGroup(tempHomGroup):
                    homGroup = tempHomGroup
                    break

            if homGroup == "":
                self.mlog("could not add %s without a homGroup - potential conflicts for automated naming" % str(geneIDList))
                return ""

        stmt = "INSERT into homology(ID, conservationID, homologyGroup, orthologyGroup, genome, locus, source) values (NULL, ?, ?, ?, ?, ?, ?) " 
        for geneID in geneIDList:
            stmtList.append((self.conservationID, homGroup, orthGroup, geneID[0], geneID[1], source))

        self.batchsqlCons(stmt, stmtList)
        return homGroup


    def deleteHomolog(self, geneID, homGroup=""):
        """ delete all entries from the homology table for a given geneID and homGroup.
        """
        stmt = "DELETE from homology where conservartion ID = '%s' and homologyGroup = '%s' and genome = '%s' and locus = '%s' " % (self.conservationID, homGroup, geneID[0], geneID[1])
        self.sqlcons(stmt, "commit")


    def deleteOrtholog(self, geneID, orthGroup):
        """ delete all entries from the homology table for a given geneID and orthGroup.
        """
        stmt = "DELETE from homology where conservartion ID = '%s' and orthologyGroup = '%s' and genome = '%s' and locus = '%s' " % (self.conservationID, orthGroup, geneID[0], geneID[1])
        self.sqlcons(stmt, "commit")


    def deleteHomologyGroup(self, homGroup):
        """ delete all entries from the homology table matching the given homology group.
        """
        stmt = "DELETE from homology where conservartion ID = '%s' and homologyGroup = '%s'  " % (self.conservationID, homGroup)
        self.sqlcons(stmt, "commit")


    def deleteOrthologyGroup(self, orthGroup):
        """ delete all entries from the homology table matching the given orthology group.
        """
        stmt = "DELETE from homology where conservartion ID = '%s' and orthologyGroup = '%s' " % (self.conservationID, orthGroup)
        self.sqlcons(stmt, "commit")


    def hasHomGroup(self, testGroup):
        """ check for the existence of testGroup as an entry in the homologyGroup column in the homology table.
        """
        stmt = "select count(*) from homology where homologyGroup = '%s'" % testGroup
        res = self.sqlcons(stmt)
        if int(res[0]) > 0:
            return True

        return False


    def returnHomologs(self, geneID):
        """ returns a list of genes that are homologous (orthologs and paralogs) to a geneID and whose genome are in 
            self.targetGenomes, based on entries in the homology table. This function will try to load entries from 
            homologene for supported genomes, if necessary.
        """
        homologList = []
        usedHomologene = False
        stmt = "SELECT ID, homologyGroup from homology where genome = '%s' and locus = '%s'" % geneID
        groups = self.sqlcons(stmt)

        if len(groups) < 1 and geneID[0] in homologeneGenomes:
            self.loadFromHomologene(geneID)
            groups = self.sqlcons(stmt)
            usedHomologene = True

        for (recID, hIDentry) in groups:
            stmt = "select genome, locus from homology where homologyGroup = '%s' and ID != '%s'  " % (str(hIDentry), str(recID))
            genes = self.sqlcons(stmt)
            for gene in genes:
                strgene = (str(gene[0]), str(gene[1]))
                if usedHomologene and strgene[0] not in self.targetGenomes:
                    continue

                if strgene not in homologList:
                    homologList.append(strgene)

        return homologList


    def returnOrthologs(self, geneID):
        """ returns a list of genes that are orthologous to a geneID and whose genome are in 
            self.targetGenomes, based on explicit entries in the homology table. 
        """
        orthologList = []
        stmt = "select ID, orthologyGroup from homology where genome = '%s' and locus = '%s'" % geneID
        groups = self.sqlcons(stmt)

        for (recID, oIDentry) in groups:
            if oIDentry == "":
                continue
            stmt = "select genome, locus from homology where orthologyGroup = '%s' and ID != '%s' " % (str(oIDentry), str(recID))
            genes = self.sqlcons(stmt)
            for gene in genes:
                strgene = (str(gene[0]), str(gene[1]))
                if strgene[0] in self.targetGenomes and strgene not in orthologList:
                    orthologList.append(strgene)

        return orthologList


    def areOrthologs(self, geneID1, geneID2):
        """ returns True if genes geneID1 and geneID2 are orthologs.
        """
        stmt = "select ID, orthologyGroup from homology where genome = '%s' and locus = '%s'" % geneID1
        groups1 = self.sqlcons(stmt)

        stmt = "select ID, orthologyGroup from homology where genome = '%s' and locus = '%s'" % geneID2
        groups2 = self.sqlcons(stmt)

        oEntries1 = []
        oEntries2 = []
        for (recID, oIDentry) in groups1:
            oEntries1.append(oIDentry)

        for (recID, oIDentry) in groups2:
            oEntries2.append(oIDentry)

        for entry in oEntries1:
            if entry in oEntries2:
                return True

        return False


    def loadFromHomologene(self, geneID):
        """ load the homologous genes to geneID from homologene.
        """
        try:
            hdb = homologyDB(self.targetGenomes)
            hGenes = hdb.getHomologousGenes(geneID)
        except:
            hdb = homologyDB(self.targetGenomes, cache=True)
            hGenes = hdb.getHomologousGenes(geneID)

        hGenes.append(geneID)
        homologyGroup = "%s-%s" % (str(geneID[0]), str(geneID[1]))
        self.insertHomologs(hGenes, homologyGroup, orthGroup="", source="homologene")


    def computeAlignments(self, geneIDList=[]):
        """ use Mafft() to calculate multiple sequence alignement (MSA) for all the homologs of geneIDList or of 
            all self.refGenes. datasetID handle (i.e. homGroup) for each group of MSA is of the form
            genome-locus' from the geneIDs in the starting genome.
        """
        if len(geneIDList) < 1:
            geneIDList = [(self.startingGenome, gene) for gene in self.refGenes]

        prog = Mafft()
        for geneID in geneIDList:
            homGroup =  str(geneID[0]) + '-' + str(geneID[1])
            hGenes = self.returnHomologs(geneID)
            hGenes.append(geneID)
            fastaFile = self.createDataFile(geneIDList = hGenes)
            prog.inputFile(fastaFile)
            prog.run()
            alignedDict = prog.getAlignment()
            for dictKey in alignedDict:
                aGeneID = dictKey.split('-')
                self.insertAlignedSequence(homGroup, aGeneID, alignedDict[dictKey])


    def insertAlignedSequence(self, datasetID, geneID, seq):
        """ save an aligned sequence into alignedSequence.
        """
        values = "(NULL, '%s', '%s', '%s', '%s', '%s')" % (self.conservationID, datasetID, geneID[0], geneID[1], seq)
        stmt = "INSERT into alignedSequence(ID, conservationID, datasetID, genome, locus, sequence) values %s" % values
        self.sqlcons(stmt, "commit")


    def getAlignedSequence(self, geneID, datasetID=""):
        """ retrieve an aligned sequence from alignedSequence using datasetID and geneID.
        """
        stmt = "SELECT sequence from alignedSequence where genome = '%s' and locus = '%s'  " % (geneID[0], geneID[1])
        if len(datasetID) > 0:
            stmt += "and datasetID = '%s' " % (datasetID)

        res = self.sqlcons(stmt)

        return str(res[0][0])


    def mapAlignmentConservation(self, strict=False, minConsLength=3, geneIDList=[]):
        """ map regions of sequences where multiple alignments show at least one other (default)
            or all genes (stritct=True) as lining up.
        """
        if len(geneIDList) < 1:
            geneIDList = [(self.startingGenome, gene) for gene in self.refGenes]

        if strict:
            criteria = "strict"
        else:
            criteria = "partial"

        for geneID in geneIDList:
            homGroup =  "%s-%s" % (str(geneID[0]), str(geneID[1]))
            hGenes = self.returnHomologs(geneID)
            hGenes.append(geneID)
            if len(hGenes) < 2:
                continue

            maskedSequences = self.maskUsingConservation(hGenes, strict)
            for gID in hGenes:
                start = 0
                consLength = 0
                for pos in range(len(maskedSequences[gID])):
                    if maskedSequences[gID][pos] == "N":
                        if start != 0 and consLength >= minConsLength:
                            self.insertConservedSequence(homGroup, gID, start, consLength, "mafft", criteria)

                        start = 0
                        consLength = 0
                        continue

                    if start == 0:
                        start = pos

                    consLength += 1

                if start != 0 and consLength >= minConsLength:
                    self.insertConservedSequence(homGroup, gID, start, consLength, "mafft", criteria)


    def maskUsingConservation(self, geneIDList, strict=False):
        """ mask every gene in geneIDList using conservation amongst themselves, which have 
            already been computed using computeAlignments()
        """
        alignmentDict = {}
        maskedDict = {}

        for gID in geneIDList:
            seqLen = 0
            try:
                alignmentDict[gID] = self.getAlignedSequence(gID)
                maskedDict[gID] = ""
                seqLen = len(alignmentDict[gID])
            except:
                return maskedDict

        if strict:
            criteria = len(geneIDList)
        else:
            criteria = 2

        for pos in range(seqLen):
            posDict = {}
            conserved = 0
            for geneID in geneIDList:
                posDict[geneID] = alignmentDict[geneID][pos]
                if posDict[geneID] != "-" and posDict[geneID] != "N":
                    conserved += 1

            if conserved >= criteria:
                for geneID in geneIDList:
                    if posDict[geneID] != "-":
                        maskedDict[geneID] += posDict[geneID]
            else:
                for geneID in geneIDList:
                    if posDict[geneID] != "-":
                        maskedDict[geneID] += "N"

        return maskedDict


    def mapSeqcompConservation(self, window=20, threshold=0.9, orthologyThreshold=0.0, minSequences=2, geneIDList=[], useFastcomp=False):
        """ map regions of sequences with seqcomp windows in all genes that have 
            more than threshold (<= 1) conservation in more than minSequences sequences.
            Can optionally use a higher threshold for orothlogs if specified using 
            orthologyThreshold.
        """
        consList = []
        if orthologyThreshold < threshold:
            orthologyThreshold = threshold

        if len(geneIDList) < 1:
            geneIDList = [(self.startingGenome, gene) for gene in self.refGenes]

        if useFastcomp:
            prog = Fastcomp()
        else:
            prog = Paircomp()

        prog.setWindowSize(window)
        # if we have a window on a sequence, then we have at least one match!
        minSeqNum = minSequences - 1
        for geneID in geneIDList:
            genePairs = []
            homGroup =  "%s-%s" % (str(geneID[0]), str(geneID[1]))
            hGenes = self.returnHomologs(geneID)
            hGenes.append(geneID)
            for first in hGenes:
                for second in hGenes:
                    if first != second and [first, second] not in genePairs and [second, first] not in genePairs:
                        genePairs.append([first, second])

            seqcompWindows = {}
            print "genepairs = %s" % str(genePairs)
            for pair in genePairs:
                fastaFile = self.createDataFile(geneIDList = pair)
                if self.areOrthologs(pair[0], pair[1]):
                    prog.setThreshold(orthologyThreshold)
                else:
                    prog.setThreshold(threshold)

                prog.inputFile(fastaFile)
                prog.run()
                seqcompWindows[(pair[0], pair[1])] = prog.getWindows()

            resultWindows = {}
            for gene in hGenes:
                resultWindows[gene] = {}

            for pair in seqcompWindows:
                (geneID1, geneID2) = pair
                for (seq1pos, seq2pos, matches, sense) in seqcompWindows[pair]:
                    if seq1pos not in resultWindows[geneID1]:
                        resultWindows[geneID1][seq1pos] = []

                    if seq2pos not in resultWindows[geneID2]:
                        resultWindows[geneID2][seq2pos] = []

                    resultWindows[geneID1][seq1pos].append((geneID2, seq2pos, sense, matches))
                    resultWindows[geneID2][seq2pos].append((geneID1, seq1pos, sense, matches))

            for geneID in hGenes:
                for position in resultWindows[geneID]:
                    otherGeneIDs = []
                    for (geneID2, seq2pos, sense, matches) in resultWindows[geneID][position]:
                        if geneID2 not in otherGeneIDs:
                            otherGeneIDs.append(geneID2)

                    if len(otherGeneIDs) >= minSeqNum:
                        criteria = "/%s:%s:%s:%s:%s" % (geneID[0], geneID[1], position, "1", window)
                        for (geneID2, seq2pos, sense, matches) in resultWindows[geneID][position]:
                            criteria += "/%s:%s:%s:%s:%s" % (geneID2[0], geneID2[1], seq2pos, sense, matches)

                        consList.append((homGroup, geneID, position, window, "seqcompCons", criteria))

        self.insertConservedSequenceBatch(consList)


    def mapMussaConservation(self, window=20, threshold=0.9, geneIDList=[], useFastcomp=False):
        """ map regions of sequences with seqcomp windows in all genes that have 
            more than threshold (<= 1) conservation. Uses transivity.
        """
        self.mapMOREMConservation(window, threshold, threshold, "mussa", geneIDList, useFastcomp)


    def mapMOREMConservation(self, window=20, orthologyThreshold=0.9, paralogyThreshold=0.7, tag="MOREM", geneIDList=[], useFastcomp=False):
        """ Implements the "Moral Equivalent of Mussa" algorithm. The function map regions of 
            sequences with seqcomp windows in all genes that have more than orthologyThreshold (<= 1) 
            conservation in orthologs and more than paralogyThreshold in paralogs. Uses transivity.
        """
        consList = []
        if len(geneIDList) < 1:
            geneIDList = [(self.startingGenome, gene) for gene in self.refGenes]

        if useFastcomp:
            prog = Fastcomp()
        else:
            prog = Paircomp()

        prog.setWindowSize(window)
        for geneID in geneIDList:
            genePairs = []
            homGroup =  "%s-%s" % (str(geneID[0]), str(geneID[1]))
            oGenes = self.returnOrthologs(geneID)
            # homologs should contain orthologs!
            hGenes = self.returnHomologs(geneID)
            if len(hGenes) < 1:
                continue

            seqcompWindows = {}
            genePairs = [[geneID, gene] for gene in hGenes]
            for pair in genePairs:
                if pair[1] in oGenes:
                    prog.setThreshold(orthologyThreshold)
                else:
                    prog.setThreshold(paralogyThreshold)

                fastaFile = self.createDataFile(geneIDList = pair)
                prog.inputFile(fastaFile)
                prog.run()
                seqcompWindows[pair[1]] = prog.getWindows()

            resultWindows = {}
            print "%s : %s %d" % (str(geneID), str(hGenes), len(seqcompWindows)) 
            for (seq1pos, seq2pos, matches, sense) in seqcompWindows[hGenes[0]]:
                resultWindows[seq1pos] = [(hGenes[0], seq2pos, sense, matches)]

            for gene in hGenes[1:]:
                newseqPositions = []
                for (seq1pos, seq2pos, matches, sense) in seqcompWindows[gene]:
                    if seq1pos not in resultWindows:
                        continue

                    newseqPositions.append(seq1pos)
                    resultWindows[seq1pos].append((gene, seq2pos, sense, matches))

                for pos in resultWindows.keys():
                    if pos not in newseqPositions:
                        del resultWindows[pos]

            for windowPos in resultWindows.keys():
                windowList = resultWindows[windowPos]
                criteria = "/%s:%s:%s:%s:%s" % (geneID[0], geneID[1], windowPos, "1", window)
                for (geneID2, seq2pos, sense, matches) in windowList:
                    criteria += "/%s:%s:%s:%s:%s" % (geneID2[0], geneID2[1], seq2pos, sense, matches)

                consList.append((homGroup, geneID, windowPos, window, tag, criteria))

                for windowEntry in windowList:
                    (gID, seq2pos, sense, matches) = windowEntry
                    consList.append((homGroup, gID, seq2pos, window, tag, criteria))

        self.insertConservedSequenceBatch(consList)


    def insertConservedSequence(self, datasetID, geneID, pos, length, method, criteria):
        """ insert an entry in conservedSequence.
        """
        values = "values(NULL, '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s')" % (self.conservationID, datasetID, geneID[0], geneID[1], pos, length, method, criteria)
        stmt = "INSERT INTO conservedSequence(ID, conservationID, datasetID, genome, locus, location, length, method, score) %s" % values
        self.sqlcons(stmt, 'commit')

    
    def insertConservedSequenceBatch(self, batchlist):
        """ insert a list of entries in conservedSequence.
        """
        sqlList = []
        stmt = "INSERT INTO conservedSequence(ID, conservationID, datasetID, genome, locus, location, length, method, score) values(NULL, ?, ?, ?, ?, ?, ?, ?, ?)" 
        for (datasetID, geneID, pos, length, method, criteria) in batchlist:
            sqlList.append((str(self.conservationID), str(datasetID), str(geneID[0]), str(geneID[1]), pos, length, str(method), str(criteria)))

        self.batchsqlCons(stmt, sqlList)


    def getConservedSequenceWindows(self, geneID, datasetID=-1, method="", criteria=""):
        """ retrieve a list of conservation windows of the form (start, length) from 
            conservedSequence using datasetID and geneID, which match a particular method and criteria.
        """
        results = []
        stmt = "SELECT location, length, score from conservedSequence where genome = '%s' and locus = '%s'  " % (geneID[0], geneID[1])
        if datasetID > 0:
            stmt += "and datasetID = '%d' " % (datasetID)

        if len(method) > 0:
            stmt += " and method = '%s' " % (method)

        if len(criteria) > 0:
            stmt += "and score = '%s' " % (criteria)

        res = self.sqlcons(stmt)
        for (location, length, criteria) in res:
            results.append((int(location), int(length), str(criteria)))

        return results


    def isConserved(self, geneID, position, length, datasetID=-1, method="", criteria=""):
        """ returns True if a particular sequence of position and length falls within a conservation
            window.
        """
        if len(position) == 2:
            position = position[0]
        stmt = "SELECT location, length from conservedSequence where genome = '%s' and locus = '%s'  " % (geneID[0], geneID[1])
        if datasetID > 0:
            stmt += "and datasetID = '%d' " % (datasetID)

        if len(method) > 0:
            stmt += " and method = '%s' " % (method)

        if len(criteria) > 0:
            stmt += "and score = '%s' " % (criteria)

        res = self.sqlcons(stmt)
        for (location, wlen) in res:
            if int(location) <= position and (int(location) + int(wlen)) >= (position + length):
                return True

        return False


    def maskNonConservedSequence(self, datasetID=-1, method="", criteria="", stripNLonger=21):
        """ masks any sequence in a dataset that was not highlighted as conserved by one or more conservation criteria. returns
            a dictionary that must be handled further. Will shrink sequences with stretches of Ns longer than stripNLonger.
        """
        maskedSeqDict = {}
        if datasetID < 0:
            datasetID = self.genepoolID

        settingsList =  self.getSettingsID(datasetID)
        geneIDList = eval(settingsList[1])
        for geneID in geneIDList:
            theseq = self.genepool[geneID]
            seqlen = len(theseq)
            maskedseq = ["N"] * seqlen
            conservedWindows = self.getConservedSequenceWindows(geneID, -1, method, criteria)
            for (start, length, crit) in conservedWindows:
                if start < 0:
                    start = 0

                if start + length >= seqlen:
                    length = seqlen - start - 1

                for index in range(start, start + length):
                    maskedseq[index] = theseq[index]

            tempseq = []
            numN = 0
            for letter in maskedseq:
                if letter.upper() == "N":
                    numN += 1
                else:
                    numN = 0

                if numN > stripNLonger:
                    continue

                tempseq.append(letter)

            maskedSeqDict[geneID] = string.join(tempseq, "")

        return maskedSeqDict


    def conservationStat(self, datasetID=-1, method="", criteria=""):
        """ report conservation level in each of the genes in the dataset.
        """
        nucleotides = ["A", "C", "G", "T", "a", "c", "g", "t"]
        totalcons = 0
        totalsize = 0
        consGeneDict = self.maskNonConservedSequence(datasetID, method, criteria)
        for geneID in consGeneDict:
            ntstat = 0
            for NT in nucleotides:
                ntstat += consGeneDict[geneID].count(NT)

            blocks = (len(consGeneDict[geneID]) - ntstat) / 21
            origSize = len(self.genepool[geneID])
            consPercentage = 100. * ntstat / float(origSize)
            print "%s %d out of %d bp in %d blocks ==> %3.2f percent of sequence conserved" % (str(geneID), ntstat, origSize, blocks, consPercentage)
            totalcons += ntstat
            totalsize += origSize

        consPercentage = 100. * totalcons / float(totalsize)
        print "total: %d out of %d bp ==> %3.2f percent of sequence conserved" % (totalcons, totalsize, consPercentage)


    def motifConservationStat(self, motifList=[]):
        """ check how many motifs instances for the motifs in the mapped motifs 
            (or only in motifList) are conserved.
        """
        if len(motifList) == 0:
            motifList = self.geneToMotifKeys(thekey="motif")

        for motID in motifList:
            index = 0
            matches = self.motifToGene(motID)
            for (loc, pos) in matches:
                mot = self.findMotif(motID)
                if self.isConserved(loc, pos, len(mot)):
                    index += 1

            print "%s\t%d out of %d conserved ==> %f pct " % (motID, index, len(matches), 100.0 * index / float(len(matches)))


    def checkForConservedSequence(self, datasetID=-1, method="", criteria=""):
        """ checks that at least two or more sequences in the dataset have conservation.
        """
        someHaveConservation = False
        numCons = 0
        checkDict = self.maskNonConservedSequence(datasetID, method, criteria)
        for entry in checkDict:
            theseq = checkDict[entry].upper()
            if "A" in theseq or "G" in theseq or "C" in theseq or "T" in theseq:
                numCons += 1

        if numCons > 1:
            someHaveConservation = True

        return someHaveConservation


    def exportConservedSequences(self, genomes=[], datasetID=-1, method="", criteria="", directory=".", prefix="cons"):
        """ exports conserved sequences to a fasta file.
        """
        (up, cds, down) = self.getSeqParameters()
        consDict = self.maskNonConservedSequence(datasetID, method, criteria, stripNLonger=100000000)
        if len(genomes) > 0:
            consDictEntries = consDict.keys()
            for entry in consDictEntries:
                if entry[0] not in genomes:
                    del consDict[entry]

        outfilename = "%s/%s.fsa" % (directory, prefix)
        consOutfile = open(outfilename, "w")
        for entry in consDict:
            entryCoordinates = cistematic.core.geneEntry(entry)
            entrySense = entryCoordinates[4]
            theseq = consDict[entry].upper()
            seqlen = len(theseq)
            (chrom, start, sense) = self.absoluteLocation((entry, (0, "F")), seqlen, (up, cds, down), entryCoordinates)
            if entrySense == "R":
                theseq = cistematic.core.complement(theseq)

            conservedBlockStart = -1
            for pos in range(seqlen):
                if theseq[pos] == "N":
                    if conservedBlockStart >= 0:
                        consStart = start + conservedBlockStart
                        consSeq = theseq[conservedBlockStart:pos]
                        consStop = consStart + len(consSeq) - 1
                        consOutfile.write(">%s_%s %s:%d-%d\n%s\n" % (entry[0], entry[1], chrom, consStart, consStop, consSeq))
                        conservedBlockStart = -1

                    continue

                if conservedBlockStart < 0:
                    conservedBlockStart = pos

            if conservedBlockStart >= 0:
                consStart = start + conservedBlockStart
                consSeq = theseq[conservedBlockStart:pos]
                consStop = consStart + len(consSeq) - 1
                consOutfile.write(">%s_%s %s:%d-%d\n%s\n" % (entry[0], entry[1], chrom, consStart, consStop, consSeq))

        consOutfile.close()


    def createConservation(self, conservationID="default", dbName=""):
        """ creates the conservation SQL tables. Should only be run once per database file.
        """
        stmtList = []
        self.loadConservation(conservationID, dbName)
        try:
            stmtList.append("CREATE table homology(ID INTEGER PRIMARY KEY, conservationID varchar, homologyGroup varchar, orthologyGroup varchar, genome varchar, locus varchar, source varchar)")
            stmtList.append("CREATE table alignedSequence(ID INTEGER PRIMARY KEY, conservationID varchar, datasetID varchar, genome varchar, locus varchar, sequence varchar)")
            stmtList.append("CREATE table conservedSequence(ID INTEGER PRIMARY KEY, conservationID varchar, datasetID varchar, genome varchar, locus varchar, location varchar, length varchar, method varchar, score varchar)")
            stmtList.append("CREATE index cons1 on homology(conservationID)")
            stmtList.append("CREATE index cons2 on alignedSequence(conservationID)")
            stmtList.append("CREATE index cons3 on conservedSequence(conservationID)")
            stmtList.append("CREATE index hom1 on homology(homologyGroup)")
            stmtList.append("CREATE index orth1 on homology(orthologyGroup)")
            stmtList.append("CREATE index homlocus1 on homology(genome, locus)")
            stmtList.append("CREATE index alignlocus2 on alignedSequence(genome, locus)")
            stmtList.append("CREATE index conslocus3 on conservedSequence(genome, locus, method)")
            for stmt in stmtList:
                self.sqlcons(stmt, commit=True)

            self.mlog("Created conservation tables in database %s" % dbName)
        except:
            self.mlog("Could not create conservation tables in database %s" % dbName)
            self.mlog("WARNING: perhaps you have called createConservation() twice on the same database?")


    def loadConservation(self, conservationID="default", dbName=""):
        """ changes the conservationID to use the specified one, or use default. Must be used before reading or
            writing any data to analysis tables.
        """
        self.conservationID = conservationID
        if len (dbName) > 0:
            self.consDBName = dbName
        else:
            self.consDBName = self.expFile


    def sqlcons(self, stmt, commit=""):
        """ executes a SQL statement and does a commit if commit is not-null. returns any results as a list.
        """
        res = []
        db = sqlite.connect(self.consDBName)
        sqlc = db.cursor()
        try:
            if self.debugSQL:
                print "Conservation->sqlcons: %s" % stmt

            sqlc.execute(stmt)
            res = sqlc.fetchall()
            try:
                if commit != "":
                    db.commit()
            except:
                self.mlog("Conservation->sqlcons (commit exception)")
        except:
            self.mlog("Conservation->sqlcons (statement exception): %s" % stmt)

        sqlc.close()
        db.close()

        return res


    def batchsqlCons(self, stmt, batch):
        """ executes a list of sql statements (usually inserts) stored in the list batch with a single commit.
        """
        res = []
        db = sqlite.connect(self.consDBName)
        sqlc = db.cursor()
        try:
            if self.debugSQL:
                print "batchsqlCons: %s" % stmt
                print "batchsqlCons: %s" % (str(batch))

            sqlc.executemany(stmt, batch)
            try:
                db.commit()
            except:
                self.mlog("Conservation->batchsqlCons (commit exception)")
        except:
            self.mlog("Conservation->batchsqlCons (statement exception): %s" % stmt)

        sqlc.close()
        db.close()

        return res