###########################################################################
#                                                                         #
# C O P Y R I G H T   N O T I C E                                         #
#  Copyright (c) 2003-10 by:                                              #
#    * California Institute of Technology                                 #
#                                                                         #
#    All Rights Reserved.                                                 #
#                                                                         #
# Permission is hereby granted, free of charge, to any person             #
# obtaining a copy of this software and associated documentation files    #
# (the "Software"), to deal in the Software without restriction,          #
# including without limitation the rights to use, copy, modify, merge,    #
# publish, distribute, sublicense, and/or sell copies of the Software,    #
# and to permit persons to whom the Software is furnished to do so,       #
# subject to the following conditions:                                    #
#                                                                         #
# The above copyright notice and this permission notice shall be          #
# included in all copies or substantial portions of the Software.         #
#                                                                         #
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,         #
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF      #
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                   #
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS     #
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN      #
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN       #
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE        #
# SOFTWARE.                                                               #
###########################################################################
#
try:
    from pysqlite2 import dbapi2 as sqlite
except:
    from sqlite3 import dbapi2 as sqlite

import string
from mmap import *
from os import environ

if environ.get("CISTEMATIC_ROOT"):
    cisRoot = environ.get("CISTEMATIC_ROOT") 
else:
    cisRoot = "/proj/genome"

__all__ = ["scerevisiae", "athaliana", "celegans", "cbriggsae", "cbrenneri", 
           "cremanei", "dmelanogaster", "mmusculus", "hsapiens", "rnorvegicus",
           "spurpuratus", "ggallus", "cfamiliaris", "mdomestica", "xtropicalis",
           "btaurus", "drerio", "ecaballus"
]

supportedGenomes = ["athaliana", "cfamiliaris", "mmusculus", "hsapiens",
                    "rnorvegicus", "ggallus", "scerevisiae", "celegans", "cbriggsae",
                    "cbrenneri", "cremanei", "dmelanogaster", "mdomestica",
                    "spurpuratus", "xtropicalis","btaurus", "drerio", "ecaballus"
]

geneDB  = {}
chromDict  = {}
chromRoot  = {}
chromGeneEntries = {}
checkGene = {}
geneInfo   = {}
getAllGenes = {}
annotInfo = {}
goInfo = {}
allAnnotInfo = {}
allGOInfo = {}

def compNT(nt):
    """ returns the complementary basepair to base nt
    """
    compDict = {"A": "T", "T": "A",
                "G": "C", "C": "G",
                "S": "S",
                "W": "W",
                "R": "Y", "Y": "R",
                "M": "K", "K": "M",
                "H": "D", "D": "H",
                "B": "V", "V": "B",
                "N": "N",
                "a": "t", "t": "a",
                "g": "c", "c": "g",
                "n": "n",
                "z": "z"
    }

    return compDict.get(nt, "N")


def complement(sequence, length=0):
    """ returns the complement of the sequence.
    """
    newSeq = ""
    seqLength = len(sequence)
    if length == seqLength:
        seqList = list(sequence)
        seqList.reverse()
        return "".join(map(compNT, seqList))

    for index in range(seqLength - 1, seqLength - length - 1, -1):
        try:
            newSeq += compNT(sequence[index])
        except:
            newSeq += "N"

    return newSeq


class Genome:
    genome = ""
    chromosome = ""
    version = ""
    dbFile = ""
    supported = False
    oldStyle = False
    customAnnotations = False


    def __init__(self, genome, chrom="", version="", dbFile="", inRAM=False):
        self.genome = genome
        if chrom != "":
            self.chromosome = chrom

        if version != "":
            self.version = version

        if dbFile != "":
            self.dbFile = dbFile

        if genome in supportedGenomes and dbFile == "":
            self.dbFile = geneDB[genome]
            self.supported = True

        if inRAM:
            self.memoryBacked = True
            self.memoryDB = sqlite.connect(":memory:")
            self.createGeneDB(inMemory=True)
            sql = self.memoryDB.cursor()
            try:
                sql.execute("PRAGMA DEFAULT_CACHE_SIZE = 500000")
                sql.execute("ATTACH '%s' as diskdb" % self.dbFile)
                for table in ["gene_entries", "gene_annotation", "gene_ontology", "sequences", "chromosomes", "sequence_features"]:
                    sql.execute("insert into %s select * from diskdb.%s" % (table, table))

                sql.execute("DETACH diskdb")
            except:
                if self.dbFile != "":
                    print "could not import %s" % self.dbFile

            sql.close()
            self.createIndices(inMemory=True)
        else:
            self.memoryBacked = False
            self.memoryDB = ""


    def setGeneDB(self, dbFile):
        self.dbFile = dbFile
        self.supported = False


    def setChromosome(self, chrom):
        self.chromosome = chrom


    def checkGene(self, geneID):
        """ returns True if the geneID matches an entry in the genome database.
        """
        (genome, gID) = geneID
        if genome != self.genome:
            return False

        try:
            stmt = "SELECT chromosome  from gene_entries where name = '%s' " % gID
            res = self.queryDB(stmt)
            if len(res) > 0:
                return True
        except:
            pass

        return False


    def geneInfo(self, geneID, version="1"):
        (genome, gID) = geneID
        result = ""
        if genome != self.genome:
            return False

        try:
            stmt = "SELECT chromosome, start, stop, length, orientation from gene_entries where name = '%s' and version = '%s' " % (gID, version)
            res = self.queryDB(stmt)
            if len(res) > 0:
                chrom = res[0]
                start = int(res[1])
                stop = int(res[2])
                if start > stop:
                    temp = stop
                    stop = start
                    start = temp

                length = int(res[3])
                orientation = res[4]
                result = (chrom, start, stop, length, orientation)
        except:
            pass

        return result


    def getallGeneInfo(self, name="", chrom="", version=""):
        resultDict = {}
        chromList = []
        stmt = "select name, chromosome, start, stop, orientation from gene_entries order by name, start "
        res = self.queryDB(stmt, fetchall=True)
        for entry in res:
            (name, chromosome, start, stop, orientation) = entry
            if name not in resultDict:
                resultDict[name] = []

            if chromosome not in chromList:
                resultDict[chromosome] = []
                chromList.append(chromosome)

            resultDict[chromosome].append((name, chromosome, int(start), int(stop), orientation))

        return resultDict


    def leftGeneDistance(self, geneID, radius=50000, version="1"):
        result = radius
        res = self.geneInfo(geneID, version)
        if res != "":
            (chrom, start, stop, length, orientation) = res
            if start > stop:
                temp = stop
                stop = start
                start = temp

            stmt = "SELECT name, start, stop, length, orientation from gene_entries where chromosome = '%s' and ((start > %d and start < %d) or (stop > %d and stop < %d)) " % (chrom, start - radius, start, start - radius, start)
            res = self.queryDB(stmt, fetchall=True)
            for entry in res:
                rstart = int(entry[1])
                rstop = int(entry[2])
                if rstart > rstop:
                    temp = rstop
                    rstop = rstart
                    rstart = temp

                thelength = start - rstart
                if (start - rstop) > 0 and (start - rstop) < thelength:
                    thelength = start - rstop

                if thelength > 0 and thelength < result:
                    result = thelength

        return result

    def rightGeneDistance(self, geneID, radius=50000, version="1"):
        result = radius		
        res = self.geneInfo(geneID, version)
        if res != "":
            (chrom, start, stop, length, orientation) = res
            if start > stop:
                temp = stop
                stop = start
                start = temp

            stmt = "SELECT name, start, stop, length, orientation from gene_entries where chromosome = '%s' and ((start > %d and start < %d) or (stop > %d and stop < %d)) " % (chrom, stop, stop + radius, stop, stop + radius)
            res = self.queryDB(stmt, fetchall=True)
            for entry in res:
                rstart = int(entry[1])
                rstop = int(entry[2])
                if rstart > rstop:
                    temp = rstop
                    rstop = rstart
                    rstart = temp

                thelength = rstop - stop
                if (rstart - stop) > 0 and (rstart - stop) < thelength:
                    thelength = rstart - stop

                if thelength > 0 and thelength < result:
                    result = thelength

        return result


    def annotInfo(self, geneID):
        (genome, gID) = geneID
        result = []
        if genome != self.genome:
            return False

        stmt = "SELECT description from gene_annotation where name = '%s'" % gID
        res = self.queryDB(stmt, fetchall=True)
        if len(res) > 0:
            for entry in res:
                result.append(entry[0])

        return result


    def goInfo(self, geneID):
        (genome, gID) = geneID
        result = []
        if genome != self.genome:
            return False

        stmt = "SELECT GOID, objType, objName, isNot, GOterm, evidence from gene_ontology where name = '%s'" % gID
        res = self.queryDB(stmt, fetchall=True)
        if len(res) > 0:
            for entry in res:
                result.append(string.join(entry,"\t"))

        return result


    def chromInfo(self, chrom=""):
        if chrom == "" and self.chromosome != "":
            chrom = self.chromosome

        stmt = "SELECT sequenceName, storageType from chromosomes where name = '%s' " % chrom
        res = self.queryDB(stmt)
        result = "%s\t%s" % (res[0], res[1])

        return result


    def allGIDs(self):
        """ returns [ list of all orf names]
        """
        result = []
        stmt = "SELECT distinct name from gene_entries"
        res = self.queryDB(stmt, fetchall=True)
        for entry in res:
            result.append(entry[0])

        return result


    def allGIDsbyGOID(self, GOID):
        """ returns [ list of all orf names] that match a particular GOID
        """
        result = []
        stmt = "SELECT distinct name from gene_ontology where GOID = '%s' " % GOID
        res = self.queryDB(stmt, fetchall=True)
        for entry in res:
            result.append(entry[0])

        return result


    def allGOIDs(self):
        """ returns the list of GOID's in the genome
        """
        result = []
        stmt = "SELECT distinct GOID from gene_ontology"
        res = self.queryDB(stmt, fetchall=True)
        for entry in res:
            result.append(entry[0])

        return result


    def allGOterms(self):
        """ returns the list of GOID's and their associated GO term in the genome
        """
        result = {}
        stmt = "SELECT distinct GOID, GOterm from gene_ontology"
        res = self.queryDB(stmt, fetchall=True)
        for entry in res:
            result[str(entry[0])] = str(entry[1])

        return result


    def getGOIDCount(self, GOID):
        """ returns the match count for a particular GOID
        """
        stmt = "SELECT distinct name from gene_ontology where GOID = '%s' " % GOID
        res = self.queryDB(stmt, fetchall=True)

        return len(res)


    def allAnnotInfo(self):
        result = {}
        stmt = "SELECT name, description from gene_annotation"
        res = self.queryDB(stmt, fetchall=True)

        for entry in res:
            geneID = (self.genome, entry[0])
            if geneID not in result:
                result[geneID] = []

            result[(self.genome,entry[0])].append(entry[1])

        return result


    def allGoInfo(self):
        result = {}
        stmt = "SELECT name, GOID, objType, objName, isNot, GOterm, evidence, other from gene_ontology order by name " 
        res = self.queryDB(stmt, fetchall=True)
        for entry in res:
            geneID = (self.genome, entry[0])
            if geneID not in result:
                result[geneID] = []

            result[(self.genome, entry[0])].append(string.join(entry[1:], "\t"))

        return result


    def allChromNames(self, partition=1, slice=0):
        result = []
        stmt = "SELECT distinct name from chromosomes"
        res = self.queryDB(stmt, fetchall=True)
        reslen = len(res)
        for index in range(reslen):
            if (index + slice) % partition == 0:
                entry = res[index]
                result.append(entry[0]) 

        return result


    def geneSeq(self, gID, maskCDS=False, maskLower=False, version="1"):
        (chrom, start, stop, length, orientation) = self.geneInfo(gID, version)
        seq = self.sequence(chrom, start, length, maskCDS, maskLower)
        if orientation == "R":
            seq = complement(seq, length)

        return seq


    def getGeneFeatures(self, gID, type="", version="1"):
        results = []
        featureClause = ""
        (genome, geneid) = gID
        if len(type) > 0:
            featureClause = ' and type = "%s" ' % type

        stmt = 'select type, chromosome, start, stop, orientation from sequence_features where name = "%s" and version = "%s"  %s order by start ' % (geneid, str(version), featureClause)
        res = self.queryDB(stmt, fetchall=True)
        for entry in res:
            (type, chromosome, start, stop, orientation) = entry
            results.append((type, chromosome, start, stop, orientation))

        return results


    def getallGeneFeatures(self):
        resultDict = {}
        stmt = "select name, type, chromosome, start, stop, orientation from sequence_features order by name, start "
        res = self.queryDB(stmt, fetchall=True)
        for entry in res:
            (name, type, chromosome, start, stop, orientation) = entry
            if name not in resultDict:
                resultDict[name] = []

            resultDict[name].append((type, chromosome, start, stop, orientation))

        return resultDict


    def getFeatureTypes(self, ftype=''):
        """ Returns the distinct feature types available in the sequence_features
            tables. Can optionally limit by feature type; the wild-card % can be 
            used to search feature substrings.
        """
        results = []
        whereClause = ""
        useLike = False
        if "%" in ftype:
            useLike = True

        if len(ftype) > 0 and not useLike:
            whereClause = 'where type = "%s" ' % ftype
        elif len(ftype) > 0:
            whereClause = 'where type LIKE "%s" ' % ftype

        stmt = "select distinct type from sequence_features %s" % whereClause
        res = self.queryDB(stmt, fetchall=True)
        for entry in res:
            results.append(entry[0])

        return results

    def getFeatures(self, ftype, name="", chrom="", version =""):
        """ Get features stored in sequence_features that match a feature type, 
            optionally restricted by name/value, chromosome, or version. Will 
            search for substrings when ftype and/or name are given with a % to 
            indicate the location of the wildcard. Returns a dictionary of features 
            with chromosomes as the keys.
        """
        results = {}
        chromList = []
        nameClause = ""
        chromClause = ""
        versionClause = ""
        useLike = "="
        if "%" in ftype:
            useLike = "LIKE"

        if len(name) > 0:
            if "%" in name:
                nameLike = "LIKE"
            else:
                nameLike = "="

            nameClause = ' and name %s "%s" ' % (nameLike, name)

        if len(chrom) > 0:
            chromClause = ' and chromosome = "%s" ' % chrom

        if len(version) > 0:
            versionClause = ' and version = "%s" ' % version

        stmt = 'select name, version, chromosome, start, stop, orientation, type from sequence_features where type %s "%s" %s %s %s order by type' % (useLike, ftype, chromClause, nameClause, versionClause)
        res = self.queryDB(stmt, fetchall=True)
        for entry in res:
            (name, version, chromosome, start, stop, orientation, atype) = entry
            if chromosome not in chromList:
                results[chromosome] = []
                chromList.append(chromosome)

            results[chromosome].append((name, version, chromosome, start, stop, orientation, atype))

        return results


    def getFeaturesIntersecting(self, chrom, qstart, qlength, ftype=""):
        """ Return features that are on a particular stretch of the genome. Can optionally
            limit by feature type; the wild-card % can  be used to search feature substrings.
        """
        results = []
        featureClause = ""
        qstop = qstart + qlength
        useLike = False
        if "%" in ftype:
            useLike = True

        if len(ftype) > 0 and not useLike:
            featureClause = ' and type = "%s" ' % ftype
        elif len(ftype) > 0:
            featureClause = ' and type LIKE "%s" ' % ftype

        #select all features that encompass our start/stop
        stmt = 'select chromosome, start, stop, orientation, name, version, type from sequence_features where chromosome = "%s" and start < %d and stop > %d %s order by start' % (chrom, qstart, qstop, featureClause)
        res = self.queryDB(stmt, fetchall=True)
        for entry in res:
            (chromosome, start, stop, orientation, name, version, atype) = entry
            results.append((name, version, chromosome, start, stop, orientation, atype))

        # select all features that have a "stop" between start and stop that aren't yet in the dataset
        stmt = 'select chromosome, start, stop, orientation, name, version, type from sequence_features where chromosome = "%s" and stop >= %d and stop <= %d %s order by start' % (chrom, qstart, qstop, featureClause)
        res = self.queryDB(stmt, fetchall=True)
        for entry in res:
            (chromosome, start, stop, orientation, name, version, atype) = entry
            if (name, version, chromosome, start, stop, orientation, atype) not in results:
                results.append((name, version, chromosome, start, stop, orientation, atype))

        # select all features on chromosome that have a "start" between start and stop
        stmt = 'chromosome, start, stop, orientation, name, version, type from sequence_features where chromosome = "%s" and start >= %d and start <= %d %s order by start' % (chrom, qstart, qstop, featureClause)
        res = self.queryDB(stmt, fetchall=True)
        for entry in res:
            (chromosome, start, stop, orientation, name, version, atype) = entry
            if (name, version, chromosome, start, stop, orientation, atype) not in results:
                results.append((name, version, chromosome, start, stop, orientation, atype))

        return results


    def getGenesIntersecting(self, chrom, qstart, qlength):
        """ Return features that are on a ptarticular stretch of the genome. Can optionally
            limit by feature type; the wild-card % can  be used to search feature substrings.
        """
        results = []
        qstop = qstart + qlength
        atype = "model"
        #select all features that encompass our start/stop
        stmt = 'select chromosome, start, stop, orientation, name, version from sequence_features where chromosome = "%s" and start < %d and stop > %d order by start' % (chrom, qstart, qstop)
        res = self.queryDB(stmt, fetchall=True)
        for entry in res:
            (chromosome, start, stop, orientation, name, version) = entry
            results.append((name, version, chromosome, start, stop, orientation, atype))

        # select all features that have a "stop" between start and stop that aren't yet in the dataset
        stmt = 'select chromosome, start, stop, orientation, name, version from sequence_features where chromosome = "%s" and stop >= %d and stop <= %d order by start' % (chrom, qstart, qstop)
        res = self.queryDB(stmt, fetchall=True)
        for entry in res:
            (chromosome, start, stop, orientation, name, version) = entry
            if (name, version, chromosome, start, stop, orientation) not in results:
                results.append((name, version, chromosome, start, stop, orientation, atype))

        # select all features on chromosome that have a "start" between start and stop
        stmt = 'select chromosome, start, stop, orientation, name, version from sequence_features where chromosome = "%s" and start >= %d and start <= %d order by start' % (chrom, qstart, qstop)
        res = self.queryDB(stmt, fetchall=True)
        for entry in res:
            (chromosome, start, stop, orientation, name, version) = entry
            if (name, version, chromosome, start, stop, orientation) not in results:
                results.append((name, version, chromosome, start, stop, orientation, atype))

        return results


    def sequence(self, chrom, start, length, maskCDS=False, maskLower=False):
        seq = ""
        if chrom == "" and self.chromosome != "":
            chrom = self.chromosome

        stmt = "select sequenceName, storageType from chromosomes where name = '%s' " % chrom
        res = self.queryDB(stmt)
        seqName = res[0]
        seqType = res[1]
        if seqType == "db":
            stmt = "select sequence from sequences where name = '%s' " % seqName
            res = self.queryDB(stmt)
            seq = res[0][start: start + length]
            res = ""
        else:
            chromFile = open("%s%s" % (cisRoot, seqName), "r")
            mymap = mmap(chromFile.fileno(),1,access=ACCESS_READ)
            mymap = mmap(chromFile.fileno(), mymap.size(), access=ACCESS_READ)
            mymap.seek(start)
            seq = mymap.read(length)
            chromFile.close()

        if maskCDS:
            stop = start + length - 1
            seqArray = list(seq)
            features = self.getFeaturesIntersecting(chrom, start, length, "CDS")
            for entry in features:
                (name, geneID, chromosome, fstart, fstop, forientation, type) = entry
                if fstart < start:
                    fstart = start

                if fstop > stop:
                    fstop = stop

                nstart = fstart - start
                nstop = fstop - start + 1
                for pos in range(nstop - nstart):
                    seqArray[nstart + pos] = "N"

            seq = string.join(seqArray, "")

        if maskLower:
            seqArray = list(seq)
            for index in range(len(seqArray)):
                if seqArray[index] in ["a", "c" , "g", "t"]:
                    seqArray[index] = "N"

            seq = string.join(seqArray, "")

        return seq


    def getChromosomeSequence(self, chrom=""):
        seq = ""
        if chrom == "" and self.chromosome != "":
            chrom = self.chromosome

        stmt = "select sequenceName, storageType from chromosomes where name = '%s' " % chrom
        res = self.queryDB(stmt)
        if res == None:
            print "Could not find chromosome %s" % chrom
            return ''

        seqName = res[0]
        seqType = res[1]
        if seqType == "db":
            res = self.queryDB('select sequence from sequences where name = "%s"' % chrom)
            seq = res[0]
            res = ""
        else:
            chromFile = open("%s%s" % (cisRoot, seqName), "r")
            seq = chromFile.readline()
            chromFile.close()

        return seq


    def chromGeneEntries(self, chrom="", lowerbound=-1, upperbound=-1):
        result = []
        if chrom == "" and self.chromosome != "":
            chrom = self.chromosome

        stmt = "select distinct start, stop, orientation, name from gene_entries where chromosome = '%s' " % chrom
        res = self.queryDB(stmt, fetchall=True)
        if lowerbound > 0 and upperbound > 0:
            for entry in res:
                start = int(entry[0])
                stop = int(entry[1])
                if stop < start:
                    start, stop = stop, start

                if stop < lowerbound or start > upperbound:
                    continue

                result.append((start, stop, entry[2], (self.genome, entry[3])))
        else:
            for entry in res:
                start = int(entry[0])
                stop = int(entry[1])
                if stop < start:
                    start, stop = stop, start

                result.append((start, stop, entry[2], (self.genome, entry[3])))

        return result


    def createGeneDB(self, dbFile="", inMemory=False):
        if len(dbFile) > 0:
            self.dbFile = dbFile

        tableDict = {"gene_entries": "ID INTEGER PRIMARY KEY, name varchar, version varchar, chromosome varchar, start varchar, stop varchar, length varchar, orientation varchar, feature varchar",
                     "gene_annotation": "ID INTEGER PRIMARY KEY, name varchar, description varchar",
                     "gene_ontology": "ID INTEGER PRIMARY KEY, name varchar, GOID varchar, objType varchar, objName varchar, isNot varchar, GOterm varchar, evidence varchar, other varchar",
                     "sequences": "ID INTEGER PRIMARY KEY, name varchar, sequenceLength varchar, sequenceType varchar, sequence varchar",
                     "chromosomes": "ID INTEGER PRIMARY KEY, name varchar, sequenceName varchar, storageType varchar",
                     "sequence_features": "ID INTEGER PRIMARY KEY, name varchar, version varchar, chromosome varchar, start int, stop int, length varchar, orientation varchar, type varchar"
        }
        
        for table in tableDict.keys():
            stmt = "create table %s(%s)" % (table, tableDict[table])
            self.writeDB(stmt, useMemory=inMemory)


    def createIndices(self, inMemory=False):
        indexDict = {"nameIndex1": ("gene_entries", "name"),
                     "nameIndex2": ("gene_annotation", "name"),
                     "nameIndex3": ("gene_ontology", "name"),
                     "goidIndex": ("gene_ontology", "GOID"),
                     "nameIndex4": ("sequences", "name"),
                     "nameIndex5": ("chromosomes", "name"),
                     "geneIDIndex": ("sequence_features", "name, type"),
                     "posIndex": ("sequence_features", "chromosome, start, stop, type"),
                     "typeIndex": ("sequence_features", "type")
        }

        for indexName in indexDict.keys():
            (tableName, fields) = indexDict[indexName]
            stmt = "CREATE INDEX %s on %s(%s)" % (indexName, tableName, fields)
            self.writeDB(stmt, useMemory=inMemory)


    def addGeneEntry(self, geneID, chrom, start, stop, orientation, feature="", version=1.0):
        (genome, gID) = geneID
        length = str(abs(int(start) - int(stop)) + 1)
        stmt = "insert into gene_entries(ID, name, version, chromosome, start, stop, length, orientation, feature) values (NULL, '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s') " % (gID, str(version), chrom, start, stop, length, orientation, feature)
        self.writeDB(stmt)


    def addFeatureEntry(self, name, geneID, chrom, start, stop, orientation, type=""):
        (genome, gID) = geneID
        length = str(abs(int(start) - int(stop)) + 1)
        stmt = "insert into sequence_features(ID, name, geneID, chromosome, start, stop, length, orientation, type) values (NULL, '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s') " % (name, gID, chrom, start, stop, length, orientation, type)
        self.writeDB(stmt)


    def addAnnotation(self, geneID, description):
        (genome, gID) = geneID
        stmt = "insert into gene_annotation(ID, name, description) values (NULL, '%s', '%s') " % (gID, description)
        self.writeDB(stmt)


    def addGoInfo(self, geneID, GOID, objType="", objName="", isNot="", GOterm="", evidence="", other=""):
        (genome, gID) = geneID
        stmt = "insert into gene_ontology(ID, name, GOID, objType, objName, isNot, GOterm, evidence, other) values (NULL, '%s', '%s',  '%s', '%s', '%s', '%s', '%s', '%s') " % (gID, str(GOID), objType, objName, isNot, GOterm, evidence, other)
        self.writeDB(stmt)


    def addSequence(self, geneID, seq, seqType, seqLen="-1"):
        (genome, gID) = geneID
        if int(seqLen) < 0:
            seqLen = str(len(seq))

        stmt = "insert into sequences(ID, name, sequenceLength, sequenceType, sequence) values (NULL, '%s', '%s', '%s', '%s')" % (gID, str(seqLen), seqType, seq)
        self.writeDB(stmt)


    def addChromosomeEntry(self, chromo, seqName, storageType):
        stmt = "insert into chromosomes(ID, name, sequenceName, storageType)  values (NULL, '%s', '%s', '%s')" % (chromo, seqName, storageType)
        self.writeDB(stmt)


    def addSequenceBatch(self, entryArray, inMemory=False):
        stmtArray = []
        stmt = "insert into sequences(ID, name, sequenceLength, sequenceType, sequence) values (NULL, ?, ?, ?, ?)" 
        for entry in entryArray:
            (geneID, seq, seqType, seqLen) = entry
            (genome, gID) = geneID
            if int(seqLen) < 0:
                seqLen = str(len(seq))

            stmtArray.append((gID, str(seqLen), seqType, seq))

        self.writeBatchDB(stmt, stmtArray)


    def addChromosomeEntryBatch(self, entryArray, inMemory=False):
        stmt = "insert into chromosomes(ID, name, sequenceName, storageType)  values (NULL, ?, ?, ?)" 
        self.writeBatchDB(stmt, entryArray, useMemory=inMemory)


    def addGeneEntryBatch(self, entryArray, inMemory=False):
        stmtArray = []
        stmt = "insert into gene_entries(ID, name, version, chromosome, start, stop, length, orientation, feature) values (NULL, ?, ?, ?, ?, ?, ?, ?, ?) "
        for entry in entryArray:
            (geneID, chrom, start, stop, orientation, feature, version) = entry
            (genome, gID) = geneID
            length = str(abs(int(start) - int(stop)) + 1)
            stmtArray.append((gID, str(version), chrom, start, stop, length, orientation, feature))

        self.writeBatchDB(stmt, stmtArray, useMemory=inMemory)


    def addFeatureEntryBatch(self, entryArray, inMemory=False):
        stmtArray = []
        stmt = "insert into sequence_features(ID, name, version, chromosome, start, stop, length, orientation, type) values (NULL, ?, ?, ?, ?, ?, ?, ?, ?) " 
        for entry in entryArray:
            (geneID, version, chrom, start, stop, orientation, type) = entry
            (genome, gID) = geneID
            length = str(abs(int(start) - int(stop)) + 1)
            stmtArray.append((gID, version, chrom, int(start), int(stop), length, orientation, type))

        self.writeBatchDB(stmt, stmtArray, useMemory=inMemory)


    def addAnnotationBatch(self, entryArray, inMemory=False):
        stmtArray = []
        stmt = "insert into gene_annotation(ID, name, description) values (NULL, ?, ?) " 
        for entry in entryArray:
            (geneID, description) = entry
            (genome, gID) = geneID
            stmtArray.append((gID, description))

        self.writeBatchDB(stmt, stmtArray, useMemory=inMemory)


    def addGoInfoBatch(self, entryArray, inMemory=False):
        stmtArray = []
        stmt = "insert into gene_ontology(ID, name, GOID, objType, objName, isNot, GOterm, evidence, other) values (NULL, ?, ?,  ?, ?, ?, ?, ?, ?) " 
        for entry in entryArray:
            (geneID, GOID, objType, objName, isNot, GOterm, evidence, other) = entry
            (genome, gID) = geneID
            stmtArray.append((gID, str(GOID), objType, objName, isNot, GOterm, evidence, other))

        self.writeBatchDB(stmt, stmtArray, useMemory=inMemory)


    def extendFeatures(self, featureFile, fileType="cistematic", replace=False):
        geneEntryList = []
        geneFeatureList = []
        currentGene = ""
        gstart = -1
        gstop = -1
        gchrom = ""
        gsense = ""
        ftype = ""
        senseArray = {"+": "F",
                      "-": "R",
                      ".": "F"
        }

        featfile = open(featureFile)
        if fileType == "cistematic":
            for line in featfile:
                if line[0] == "#":
                    continue

                fields = line.strip().split()
                if currentGene == "":
                    currentGene = fields[0]
                    gchrom = fields[2]
                    gstart = int(fields[3])
                    gsense = senseArray[fields[5]]

                if fields[0] != currentGene:
                    geneEntryList.append(((self.genome, currentGene), gchrom, gstart, gstop, gsense, "Transcript", "1"))
                    currentGene = fields[0]
                    gchrom = fields[2]
                    gstart = int(fields[3])
                    gsense = senseArray[fields[5]]

                lstart = int(fields[3])
                gstop = int(fields[4])
                ftype = fields[6]
                geneFeatureList.append(((self.genome, currentGene), "1", gchrom, lstart, gstop, gsense, ftype))
        elif fileType == "UCSC":
            for line in featfile:
                if line[0] == "#":
                    continue

                geneFields = line.split("\t")
                exonNum = int(geneFields[8])
                exonStarts = geneFields[9].split(",")
                exonStops = geneFields[10].split(",")
                gchrom = geneFields[2][3:]
                gsense = senseArray[geneFields[3]]
                gstop = int(geneFields[7]) - 1
                gstart = int(geneFields[6]) - 1
                geneID = ("generic", geneFields[1])
                for index in range(exonNum):
                    estart = int(exonStarts[index]) - 1
                    estop = int(exonStops[index]) - 1
                    if estart >= gstart and estop <= gstop:
                        geneFeatureList.append((geneID, "1", gchrom, estart, estop, gsense, "CDS"))
                    elif estop <= gstart:
                        if gsense == "F":
                            fType = "5UTR"
                        else:
                            fType = "3UTR"

                        geneFeatureList.append((geneID, "1", gchrom, estart, estop, gsense, fType))
                    elif estart >= gstop:
                        if gsense == "F":
                            fType = "3UTR"
                        else:
                            fType = "5UTR"

                        geneFeatureList.append((geneID, "1", gchrom, estart, estop, gsense, fType))
                    elif estart <= gstop and estart > gstart:
                        if gsense == 'F':
                            fType = '3UTR'
                        else:
                            fType = '5UTR'

                        geneFeatureList.append((geneID, "1", gchrom, estart, gstop, gsense, "CDS"))
                        geneFeatureList.append((geneID, "1", gchrom, gstop + 1, estop, gsense, fType))
                    elif estart < gstart and estop <= gstop:
                        if gsense == "F":
                            fType = "5UTR"
                        else:
                            fType = "3UTR"

                        geneFeatureList.append((geneID, "1", gchrom, estart, gstart - 1, gsense, fType))
                        geneFeatureList.append((geneID, "1", gchrom, gstart, estop, gsense, "CDS"))
                    else:
                        if gsense == "F":
                            fType1 = "5UTR"
                            fType2 = "3UTR"
                        else:
                            fType1 = "3UTR"
                            fType2 = "5UTR"

                        geneFeatureList.append((geneID, "1", gchrom, estart, gstart - 1, gsense, fType1))
                        geneFeatureList.append((geneID, "1", gchrom, gstart, gstop, gsense, "CDS"))
                        geneFeatureList.append((geneID, "1", gchrom, gstop + 1, estop - 1, gsense, fType2))
        else:
            print "%s format not supported yet"
            featfile.close()
            return

        featfile.close()
        if replace==True:
            self.queryDB("delete from sequence_features", useMemory=True)
            self.queryDB("delete from gene_entries", useMemory=True)        

        self.addGeneEntryBatch(geneEntryList, inMemory=True)
        self.addFeatureEntryBatch(geneFeatureList, inMemory=True)


    def queryDB(self, stmt, fetchall=False, useMemory=False):
        if useMemory or self.memoryBacked:
            db = self.memoryDB
        else:
            db = sqlite.connect(self.dbFile, timeout = 60)

        sql = db.cursor()
        sql.execute(stmt)
        if fetchall:
            results = sql.fetchall()
        else:
            results = sql.fetchone()

        sql.close()
        if not (useMemory or self.memoryBacked):
            db.close()		

        return results


    def writeDB(self, stmt, useMemory=False):
        if useMemory:
            db = self.memoryDB
        else:
            db = sqlite.connect(self.dbFile, timeout = 60)

        sql = db.cursor()
        sql.execute(stmt)
        db.commit()
        sql.close()
        if not useMemory:
            db.close()


    def writeBatchDB(self, stmt, stmtArray, useMemory=False):
        if useMemory:
            db = self.memoryDB
        else:
            db = sqlite.connect(self.dbFile, timeout = 60)

        sql = db.cursor()
        try:
            sql.executemany(stmt, stmtArray)
        except:
            print "writeBatchDB: problem with %s" % stmt

        db.commit()
        sql.close()
        if not useMemory:
            db.close()


def processSql(sqlFile, keyFields={}):
    """ process a UCSC formatted .sql file to identify the table name and the position of key fields. Specifying the 
        name of important fields in keyFields is highly recommended. A key in keyFields represents the sql column 
        name in the file, while the corresponding value corresponds to the feature name. Blank values mean that the 
        same field name will be used. For example, one possible keyFields dict would be:
        keyFields = {'chromStart':'start', 'chromEnd':'stop', 'chrom':'', 'name':'', 'score':'value', 'strand':'orientation'}
    """
    fields = {}
    infile = open(sqlFile)
    line = ""
    while "CREATE TABLE" not in line:
        line = infile.readline()
        continue

    tabFields = line.split()
    tabName = tabFields[2]
    if keyFields == {}:
        fields["chrom"] = 1
        fields["start"] = 2
        fields["stop"] = 3
        fields["name"] = 4
    else:
        index = 0
        for line in infile:
            lineFields = line.split()
            if lineFields[0] in keyFields.keys():
                if keyFields[lineFields[0]] == "":
                    outfield = lineFields[0]
                else:
                    outfield = keyFields[lineFields[0]]

                fields[outfield] = index

            index += 1

    infile.close()

    return (tabName, fields)


def processTrack(genomeObject, dataFile, typeName="MISC", dataFields={}, version="1"):
    """ process data for a UCSC track, given an instantiated genome object, the data, the name for the feature 
        type in sequence_features, the dataFields in the format returned by processSql(), and a version.
        If genomeObject is the empty string '', then processTrack() will simply print out the aggregate records,
        rather than use addFeatureEntryBatch() to record the added features.

        Note that numberic values are overloaded into the name field using @ as a delimiter.
    """
    records = []
    if dataFields == {}:
        dataFields["chrom"] = 1
        dataFields["start"] = 2
        dataFields["stop"] = 3
        dataFields["name"] = 4
    
    infile = open(dataFile)
    for line in infile:
        fields = line.strip().split("\t")
        if "name" in dataFields:
            recName = fields[dataFields["name"]]
        else:
            recName = typeName

        if "value" in dataFields:
            recName += "@" + fields[dataFields["value"]]

        start = int(fields[dataFields["start"]])
        stop = int(fields[dataFields["stop"]])
        chrom = fields[dataFields["chrom"]][3:]
        chrom.replace("_random", "rand")
        orientation = "F"
        if "orientation" in dataFields:
            if fields[dataFields["orientation"]] == "-":
                orientation = "R"

        if "version" in dataFields:
            version = fields[dataFields["version"]]

        records.append((("placeholder", recName), version, chrom, start, stop, orientation, typeName.upper()))

    if genomeObject != "":
        genomeObject.addFeatureEntryBatch(records)
    else:
        print records


# Voodoo to get recursive imports working
for genome in supportedGenomes:
    importString = "import %s" % genome
    exec importString
    geneDB[genome]  = eval("%s.geneDB" % genome)