import sys
import optparse
import string
try:
    import psyco
    psyco.full()
except:
    print "psyco not running"

from cistematic.genomes import Genome
from commoncode import getConfigParser, getConfigIntOption


def main(argv=None):
    if not argv:
        argv = sys.argv

    verstring = "getsplicefa: version 1.1"
    print verstring
    delimiter = "|"

    usage = "usage: python getsplicefa.py genome ucscModels outfilename maxBorder [--verbose] [--spacer num]\
            \n\twhere spacer is by default 2, and maxBorder should be readlen - (2 * spacer)\
            \n\tdelimiter is set to %s - edit the code to change it, if necessary\n" % delimiter

    parser = makeParser(usage)
    parser.set_defaults(doVerbose=False, spacer=2)
    (options, args) = parser.parse_args(argv[1:])

    try:
        genome = args[0]
        datafilename = args[1]
        outfilename = args[2]
        maxBorder = int(args[3])
    except IndexError:
        print usage
        sys.exit(1)

    getSpliceFasta(genome, datafilename, outfilename, maxBorder, options.doVerbose, options.spacer, delimiter)


def makeParser(usage=""):
    parser = optparse.OptionParser(usage=usage)
    parser.add_option("--verbose", action="store_true", dest="doVerbose",
                      help="show verbose messages [default: False]")
    parser.add_option("--spacer", type="int", dest="spacer",
                      help="number of spacer NTs to use [default: 2")

    configParser = getConfigParser()
    section = "getsplicefa"
    doVerbose = getConfigIntOption(configParser, section, "doVerbose", False)
    spacer = getConfigIntOption(configParser, section, "spacer", 2)

    parser.set_defaults(doVerbose=doVerbose, spacer=spacer)

    return parser


def getSpliceFasta(genome, datafilename, outfilename, maxBorder, doVerbose=False, spacer=2, delimiter="|"):
    spacerseq = "N" * spacer

    datafile = open(datafilename)
    hg = Genome(genome)

    spliceCountDict = {}
    exonStartDict = {}
    exonStopDict = {}
    exonLengthDict = {}
    nameToChromDict = {}
    nameToComplementDict = {}
    alreadySeen = {}
    counter = 0

    for line in datafile:
        fields = line.split()
        name = fields[0]
        spliceCount = int(fields[7]) - 1
        if spliceCount < 1:
            continue

        counter += spliceCount
        spliceCountDict[name] = spliceCount
        chrom = fields[1][3:]
        if chrom == "chrM":
            continue

        nameToChromDict[name] = chrom
        if chrom not in alreadySeen:
            alreadySeen[chrom] = []

        nameToComplementDict[name] = fields[2]
        exonStarts = []
        exonStops = []
        for val in fields[8].split(",")[:-1]:
            exonStarts.append(int(val))

        for val in fields[9].split(",")[:-1]:
            exonStops.append(int(val))

        exonStartDict[name] = exonStarts
        exonStopDict[name] = exonStops
        exonLengths = []
        for index in range(spliceCount + 1):
            exonLengths.append(exonStops[index] - exonStarts[index] + 1)

        exonLengthDict[name] = exonLengths

    print len(spliceCountDict)
    print counter

    missedCount = 0
    depressedCount = 0
    splicefileindex = 1
    spliceCounter = 0
    outfile = open(outfilename, "w")
    for name in nameToChromDict:
        try:
            spliceCount = spliceCountDict[name]
        except:
            continue

        exonStarts = exonStartDict[name]
        exonStops = exonStopDict[name]
        exonLengths = exonLengthDict[name]
        chrom = nameToChromDict[name]
        for index in range(spliceCount):
            if (exonStops[index], exonStarts[index + 1]) in alreadySeen[chrom]:
                continue

            regionstart = exonStops[index] - maxBorder
            alreadySeen[chrom].append((exonStops[index], exonStarts[index + 1]))
            beforeLen = exonLengths[index]
            afterLen = exonLengths[index + 1]
            if (beforeLen + afterLen) < maxBorder + spacer:
                missedCount += 1
                continue

            if (beforeLen + afterLen) < 2 * maxBorder:
                depressedCount += 1

            if beforeLen > maxBorder:
                beforeLen = maxBorder

            if afterLen > maxBorder:
                afterLen = maxBorder

            try:
                beforeSplice = hg.sequence(chrom, exonStops[index] - maxBorder, maxBorder)
                afterSplice = hg.sequence(chrom, exonStarts[index + 1], maxBorder)
            except:
                if doVerbose:
                    print "could not get chr%s:%d-%d" % (chrom, exonStops[index], exonStarts[index + 1])
                continue

            sequenceHeader = string.join([name, delimiter, str(index), delimiter, str(regionstart)], "")
            spliceJunctionSequence = string.join([spacerseq, beforeSplice.upper(), afterSplice.upper(), spacerseq], "")
            outstring = ">%s\n%s\n" % (sequenceHeader, spliceJunctionSequence)
            outfile.write(outstring)

        splicefileindex += 1
        spliceCounter += 1
        if spliceCounter > 10000:
            print "%d genes" % splicefileindex
            spliceCounter = 0

    outfile.close()

    print "%d splices too short to be seen" % missedCount
    print "%d splices will be under-reported" % depressedCount

if __name__ == "__main__":
    main(sys.argv)