##Sarah Aerni
##Created:  July 12, 2005
##Modified: June 22, 2006
##Modified: Oct  22, 2008 by Ali (Minor modifications to outputs)
##This is a greedy motif finding program.
##Using background models it determines which motifs are overrepresented
###############################################################################
##include a way to go back and
##uses the new way of doing background and the new scoring rather than simple
##consensus it uses the log likelihood
###############################################################################

import hotshot
import random
import math
import sys
import copy
import time

from math import log
from math import e
from math import ceil
from random import shuffle
from random import randint

ConsensusScore= {True:2, False:1}

NTDIndices = {'A':0, 'C':1, 'G':2, 'T':3}
IndexNTDs  = {0:'A', 1:'C', 2:'G', 3:'T'}
INSERTION_N = "N"

NTDA=0
NTDC=1
NTDG=2
NTDT=3

INSERTION_MARKER="S"
INSERTION_FILLER="N"

#markov size - 1 = markov model
#example:
#if you want a markov 1 you want to check 2 ntds total
#markov_size = 2
#declare global variables
global MARKOV_WINDOW
global MW1
global percID
global Founders
global UseRC
global Frequency
global NumOfMismatch
Frequency={}
global sizeOfMotif
global sequences
global founderPercID
#MotifFinder with 3 arguments
#input:     sequences       sequences in which we are to find a motif
#           sizeOfMotif     size of the motif to be identified in the sequences
#           numberOfMotifs  the number of motifs of the indicated size to be
#                           identified in the sequences
#           Iterations      The number of iterations that will be perfdormed in
#                           order to determine an optimal motif
#           Frequency       Markov Model for background
#           UseRC           a boolean indicating whether the reverse complement 
#                           should be used i the finding of a motif
#                           (True = yes)
#           Founders        a boolean indicating whether the first 2 sequences
#                           should always be the founder sequences
#           percID          percent of the aligned sequences that must be 
#                           identical in order to be considered for a motif to
#                           be considered (between the founder sequences then
#                           between founder sequence consensus and each other
#                           subsequence being considered)
#           founderPercID   The minimum percent identity two aligned sequences
#                           must be to qualify as founder sequences
#           MARKOV_SIZE     Size of the markov model (corresponds to length of
#                           the words in the dictionary Frequency)
#output:    (List1, List2)  List1 contains PSFMs of the indicated length and up
#                           to the indicated number (there may be fewer) in the
#                           standard cistematic format, while List2 holds the 
#                           corresponding sequences.
#
#This function will call the overloaded MotifFinder function, passing in a 
#default amount for the number of iterations (currently set at 100)
def MotifFinder(sequences, minSize, maxSize, numberOfMotifs, Iterations,
                Frequency, UseRC, Founders, percID, founderPercID,MARKOV_SIZE,
                model,masking):

    #declare global variables
    global MARKOV_WINDOW
    global MW1
    global NumOfMismatch
    global sizeOfMotif
    numOfSequences=len(sequences)

    #the minimum window size permitted by this program is a motif of length 6
    if minSize < 6:
        minSize = 6

    #sanity check for reasonable motif sizes
    if minSize > maxSize:
        print "Please input a minimum size that is smaller than a maximum size"
        print "User input:\nMinimum Size:\t%i\nMaximumSize:\t%i" % (minSize, maxSize)
        return None

    #sanity check for reasonable percentID
    if percID> founderPercID:
        print "Your input %i for percent identity for founder sequences" % founderPercID,
        print "is less than your input for general percent identity at",percID
        print "Please adjust your input"
        return None

    #a user may input a number of iterations which exceeds the possible
    #for a given set of input values, in which case the number of iterations
    #should be decreased to another size
    maxIterations = (maxSize - minSize+1)*Choose(len(sequences),2)*Factorial(len(sequences)-2)
    if maxIterations < Iterations:
        print "The number of Iterations you entered", Iterations,
        print "is being reduced to",maxIterations,
        Iterations = maxIterations

    #adjusting the window as remarked above
    MARKOV_WINDOW = MARKOV_SIZE + 1
    MW1=MARKOV_SIZE
    print len(sequences), "sequences search for",numberOfMotifs,
    print " motifs of size",minSize,"to",maxSize
    print "The percent Identity input is",percID,"%"

    #this list will contain motifs found (up to the NumberOfMotifs passed in
    sequencesToCompare = [i.upper() for i in sequences]
    #as a preprocessing steps, a string of Ns will be replaced by one N
    for i in xrange(len(sequences)):
        splitByN=sequencesToCompare[i].split('N')
        j = 0;
        finalSequence=""
        max_j=len(splitByN)
        while j  < max_j :
            if len(splitByN[j])==0:
                finalSequence="".join([finalSequence,'N'])
                while len(splitByN[j])==0:
                    j+=1
                    if j==max_j:
                        break
            else:
                finalSequence="".join([finalSequence,splitByN[j]])
                j+=1
        sequencesToCompare[i]=finalSequence
    originalSeqs = []
    aveCalcSeqs = []

    if UseRC:
        for seqIndex in xrange(len(sequencesToCompare)):
            sequence = revComp(sequencesToCompare[seqIndex])
            aveCalcSeqs.append(''.join([sequencesToCompare[seqIndex], sequence]))
            originalSeqs.append(''.join([sequencesToCompare[seqIndex],INSERTION_N,sequence]))
            sequencesToCompare[seqIndex] = ''.join([sequencesToCompare[seqIndex],INSERTION_N*(MARKOV_WINDOW),sequence[MW1:]])
            sequence = ''.join([INSERTION_N*(MW1),sequencesToCompare[seqIndex][MW1:]])
            sequencesToCompare[seqIndex] = sequence

    AllPSFMS = []
    AllPSFMseqs = [] 
    for motif_i in xrange(numberOfMotifs):
        empty=min([len(sequencesToCompare[i]) for i in xrange(len(sequencesToCompare))])
        if empty < maxSize:
            print "A sequence has been depleted of all options"	   
            return (AllPSFMS, AllPSFMseqs)

        OverallBest = "Infinity"
        print "MOTIF NUMBER %i\n" % motif_i
        #multiple iterations are attempted to avoid local maxima
        for step in xrange(Iterations):
            sys.stdout.flush() 
            numIncluded = 2 #used in order to create PSFMs from PWMs
            #CompareList will be used to randomly generate the sequences to be
            #compared
            CompareList = range(len(sequencesToCompare))
            BestScore = "Infinity"
            #find the best shared motif between the first two sequences
            #the motif must be completely contained within the sequence
            #or its reverse complement
            NumberOfTries = 0
            while (BestScore == "Infinity"):
                #PWM created from all sequences included in the motif
                PWM = []
                #pick motif size randomly within the range provided by the user
                sizeOfMotif = random.randint(minSize,maxSize)
                NumOfMismatch = sizeOfMotif-ceil((percID)/100.0*sizeOfMotif)
                #if the motif average is not yet established create the average
                #and store tis value in the dictionary

                #best motif startpoints will be stored in this list of start
                #locations
                BestMotif = [-1] * len(sequencesToCompare)
                #these are start locations are used in order to test new best
                #start indices
                startLocs = BestMotif[:]
                #if it appears there are no more motifs possible quit       
                if (NumberOfTries > Choose(len(sequencesToCompare), 2)):
                    print "No more motifs to be found after %i tries" % NumberOfTries
                    return (AllPSFMS, AllPSFMseqs)
                NumberOfTries +=1
                #randomize sequence order
                #if the sequence founders are input then you only want to
                #randomize the remaining sequences
                if Founders:
                    CompareList = [0,1]
                    remaining = range(2,len(sequencesToCompare))
                    shuffle(remaining)
                    CompareList.extend(remaining)
                #if no founders are specified, randomize all
                else:
                    shuffle(CompareList)

                SeqIndex0 = CompareList[0]
                SeqIndex1 = CompareList[1]
                #founder sequences - first 2 sequences - are selected 
                #create random start sites to avoid favoring the 5' region
                startLocs[SeqIndex0] = -1
                startLocs[SeqIndex1] = -1
                BestMotif[SeqIndex0] = -1
                BestMotif[SeqIndex1] = -1
                BestScore = "Infinity"            
                #define the best motif between the first two sequences the 
                #inital set of comparisons are performed at the same indices in
                #both "founder" sequences

                #to ensure that the startpoint is random we will rearrange the
                # sequences
                start0 = randint(0,len(sequencesToCompare[SeqIndex0])-sizeOfMotif-1)
                accessSeq=sequencesToCompare[SeqIndex0]
                seq0 = ''.join([accessSeq[start0+1:],'N',accessSeq[:start0+sizeOfMotif]])
                start1 = randint(0,len(sequencesToCompare[SeqIndex1])-sizeOfMotif-1)
                accessSeq=sequencesToCompare[SeqIndex1]
                seq1 = ''.join([accessSeq[start1+1:],'N',accessSeq[:start1+sizeOfMotif]])

                #create shifted score, then check to see if it meets all
                #criteria to become the best.

                #Perform a time efficient alignment of the founder sequences
                (BestScore) = TimeEfficientAlignment(seq0,seq1,SeqIndex0,SeqIndex1,startLocs, founderPercID)

                if BestScore=="Infinity":
                    continue
                NLoc0=len(sequencesToCompare[SeqIndex0])-start0-1
                NLoc1=len(sequencesToCompare[SeqIndex1])-start1-1

                #remap to the correct location
                if startLocs[SeqIndex0] > NLoc0:
                    startLocs[SeqIndex0]=(startLocs[SeqIndex0]+start0)%len(sequencesToCompare[SeqIndex0])
                else:
                    startLocs[SeqIndex0]=(startLocs[SeqIndex0]+start0+1)

                if startLocs[SeqIndex1] > NLoc1:
                    startLocs[SeqIndex1]=(startLocs[SeqIndex1]+start1)%len(sequencesToCompare[SeqIndex1])
                else:
                    startLocs[SeqIndex1]=(startLocs[SeqIndex1]+start1+1)

                #if we found a good score get all the info necessary to
                #continue, including a PWM and updating the startLocs
                if BestScore != "Infinity":
                    index0 = startLocs[SeqIndex0]
                    index1 = startLocs[SeqIndex1]
                    seq0 = sequencesToCompare[SeqIndex0]
                    seq1 = sequencesToCompare[SeqIndex1]
                    motif0 = seq0[index0:index0+sizeOfMotif]
                    motif1 = seq1[index1:index1+sizeOfMotif]
                    BestMotifSeqs=[motif0,motif1]
                    PWM = convert2PWM([motif0, motif1],sizeOfMotif)

                    #this PWM will be used in order to establish a criterion
                    #for future sequences, whether they meet a threshold
                    #similiarty to the founder sequences

            #if it exists, find the best scoring motifs in the remaining
            #sequences.

            for seqCompIndex in xrange(2, len(sequencesToCompare)):
                maxScores=[max(PWM[P_I]) for P_I in range(len(PWM))]
                BestScore = "Infinity"
                seq=CompareList[seqCompIndex]
                sequenceToAdd = sequencesToCompare[seq]
                max_i=len(sequenceToAdd)-sizeOfMotif
                bestMismatch = NumOfMismatch
                #to ensure that the startpoint is random we will rearrange the 
                #sequences
                start_i = randint(0,max_i-1)
                seq_i = ''.join([sequenceToAdd[start_i+1:],'N',sequenceToAdd[:start_i+sizeOfMotif]])
                i=0
                while (i<=max_i):
                    motif=seq_i[i:i+sizeOfMotif]
                    #skip this area if there are any masked regions found
                    nLoc=motif.rfind('N')
                    if nLoc >=0:
                        i+=nLoc+1
                        continue

                    mmNum = 0
                    j = 0
                    while (j < sizeOfMotif):
                        mmNum+=int(PWM[j][NTDIndices[motif[j]]]!=maxScores[j])
                        if mmNum > bestMismatch:
                            break
                        j+=1

                    #set a new best if we found it (always keep originals)
                    if mmNum < bestMismatch:
                        bestMismatch = mmNum
                        startLocs[seq]=i

                    #we cannot improve upon a perfect match so we break end our
                    #search if this occurs
                    if mmNum == 0:
                        startLocs[seq]=i
                        break
                    i+=1

                #if somethigng was found we add it to our PWM
                if startLocs[seq]!= -1:
                    NLoc_i=len(sequenceToAdd)-start_i-1
                    if startLocs[seq] > NLoc_i:
                        startLocs[seq]=(startLocs[seq]+start_i)%len(sequenceToAdd)
                    else:
                        startLocs[seq]=(startLocs[seq]+start_i+1)

                    starti=startLocs[seq]
                    sAdd=sequencesToCompare[seq][starti:starti+sizeOfMotif]
                    checkPWM=add2PWMReturn(sAdd,PWM)
                    add=True
                    checkMaxScores=[max(checkPWM[P_I]) for P_I in range(len(checkPWM))]
                    for seq in BestMotifSeqs:
                        mismatches=0
                        for i in range(len(seq)):
                            mismatches+=int(checkPWM[i][NTDIndices[seq[i]]]!=checkMaxScores[i])
                            if mismatches > NumOfMismatch:
                                add=False
                                break
                    if add:
                        BestMotifSeqs.append(sAdd)
                        PWM=copy.deepcopy(checkPWM)
                        numIncluded+=1 
                    elif model=="oops":
                        break
                #if we require one occurence per sequence then if we do not
                #have a motif in this sequence we stop our search
                elif model=="oops":
                    break

            GroupScore =0
            #if we have an oops model and not all sequences are being included,
            #we have to continue without recording what we have so far
            if model=="oops" and numIncluded<numOfSequences:
                continue

            #obtain motif scores        
            GroupScore=0
            for i in range(len(PWM)):
                GroupScore += max(PWM[i])

            #store this as the best location if it is the best of this set of 
            #motifs part of that if statement below
            if ((OverallBest == "Infinity" and GroupScore != "Infinity" )or \
                GroupScore > OverallBest):
                nextToCompare = sequencesToCompare[:]
                MotifSeqs = []
                OrderOfBest = CompareList[:]
                BestIndices = startLocs[:]
                bestMotifSeqYet = BestMotifSeqs
                OverallBest = GroupScore
                PSFM = convert2PSFM(PWM, numIncluded)
                OverallPWM=PWM[:]
                for i in xrange(len(sequencesToCompare)):
                    SeqIndexi = CompareList[i]
                    #some sequences may not contain the motifs, if so you do no
                    #want to include them
                    if BestIndices[SeqIndexi] != -1:
                        sAdd=sequencesToCompare[SeqIndexi]
                        sAddI=BestIndices[SeqIndexi]
                        Motifi = sAdd[sAddI:sAddI+sizeOfMotif]
                        MotifSeqs.append(Motifi)

                        #mask out original location
                        nextToCompare[SeqIndexi]=''.join([sAdd[:sAddI],INSERTION_N*(sizeOfMotif),sAdd[sAddI+sizeOfMotif:]])
                        sAdd=nextToCompare[SeqIndexi]
                        sAddLen=len(sAdd)
                        nextToCompare[SeqIndexi]=''.join([sAdd[:sAddLen-sAddI-sizeOfMotif],INSERTION_N*(sizeOfMotif),sAdd[sAddLen-sAddI:]])
            TopMotif = convert2motif(MotifSeqs, sizeOfMotif)

        #if not more motifs are found (best score is -1) then it is likely that
        #we have found as many as we can
        if OverallBest == "Infinity":
            print "No more motifs were found!"
            return (AllPSFMS, AllPSFMseqs)

        #if a motif was found report it here and add it to the total motifs
        #Print out your results if any were reached!
        for i in xrange(len(sequencesToCompare)):
            SeqIndexi = OrderOfBest[i]            
            if BestIndices[SeqIndexi] != -1:
                Motifi = sequencesToCompare[SeqIndexi][BestIndices[SeqIndexi]:BestIndices[SeqIndexi]+sizeOfMotif]
                MotifSeqs.append(Motifi)

        #add the current best motif to the total of all motifs for returning
        #to the user
        AllPSFMS.append(PSFM)
        AllPSFMseqs.append(bestMotifSeqYet)
        print OverallPWM

        NTDSTUFF = {0:'A',1:'C',2:'G',3:'T'}
        for Pntd in [0,1,2,3]:
            print ""
            print NTDSTUFF[Pntd],
            for Pj in xrange(len(PSFM)):
                print "\t %.003f"%(PSFM[Pj][Pntd]),

        print "\n**********\n"
        print TopMotif
        print "\n**********\n"
        print "Score:",OverallBest 
        maxScores=[max(OverallPWM[P_I]) for P_I in range(len(OverallPWM))]
        for maski in xrange(numOfSequences):
            maskThis=nextToCompare[maski]
            i = 0
            max_i = len(maskThis)-sizeOfMotif
            while (i<=max_i):
                motif=maskThis[i:i+sizeOfMotif]
                #skip this area if there are any masked regions found
                nLoc=motif.rfind('N')
                if nLoc >=0:
                    i+=nLoc+1
                    continue
                mmNum = 0
                j = 0
                while (j < sizeOfMotif):
                    if PWM[j][NTDIndices[motif[j]]]!=maxScores[j]:
                        mmNum+=1
                        if mmNum > NumOfMismatch:
                            break
                    j+=1
                #if the location is below threshold we mask it out also
                if mmNum>NumOfMismatch:
                    i+=1
                    continue

                i+=1

        #replace old sequences with the new one
        sequencesToCompare = nextToCompare[:]
        #reduce masked regions to single N
        for i in xrange(len(sequences)):
            splitByN=sequencesToCompare[i].split('N')
            j = 0;
            finalSequence=""
            max_j=len(splitByN)
            while j  < max_j :
                if len(splitByN[j])==0:
                    finalSequence="".join([finalSequence,'N'])
                    while len(splitByN[j])==0:
                        j+=1
                        if j==max_j:
                            break
                else:
                    finalSequence="".join([finalSequence,splitByN[j]])
                    j+=1
            sequencesToCompare[i]=finalSequence

    return (AllPSFMS, AllPSFMseqs)


#MaskLocation
#input:     sequence    sequence to be masked
#	    location    location at which to mask the sequence
#           length      length of area to mask
#output     string      masked sequence
def MaskLocation(sequence,start,length):
    finalsequence=''.join([sequence[:start],INSERTION_FILLER*(length),sequence[start+length:]])
    newLen=len(finalsequence)
    finalsequence=''.join([finalsequence[:newLen-start-length],INSERTION_FILLER*(length),finalsequence[newLen-start:]])

    return finalsequence


#TimeEfficientAlignment
#input:     seq0        string sequence in which to calculate a best location
#           seq1        string second sequence
#           start0      integer start location for alignment (gives a skew)
#           start1      integer start for second sequence
#           BestScore   float which is the current bestscore
#           percID   percent of the aligned sequences that must be identical
#                       in order to be considered for a motif
#output     float       Best scoring alignment's score
#
#will be calculated as an ungapped consensus between two sequences as the
#algorithm steps through each window
def TimeEfficientAlignment(seq0, seq1, SeqIndex0, SeqIndex1, startLocs, percID):
    global sizeOfMotif
    NumOfMismatch = sizeOfMotif-ceil((percID)/100.0*sizeOfMotif)
    bestMismatch = NumOfMismatch
    MW1=MARKOV_WINDOW-1

    #create shifted score, then check to see if it meets all
    #criteria to become the best.

    i = 0
    maxi=len(seq0)-sizeOfMotif
    maxj=len(seq1)-sizeOfMotif

    Score="Infinity"

    while (i<=maxi):
        motif=seq0[i:sizeOfMotif+i]
        Nloc=motif.rfind('N')
        if Nloc>=0:
            i+=Nloc+1
            continue
        j=0
        while (j<=maxj):
            Nloc=seq1[j:j+sizeOfMotif].rfind('N')
            if Nloc>=0:
                j+=Nloc+1
                continue
            mismatchedNTDs=0
            for k in xrange(sizeOfMotif):
                if motif[k]!=seq1[k+j]:
                    mismatchedNTDs+=1
                    if mismatchedNTDs > bestMismatch:
                        break
            if mismatchedNTDs == 0:
                Score = 2*sizeOfMotif-bestMismatch
                startLocs[SeqIndex0]=i
                startLocs[SeqIndex1]=j
                return (Score)
            if mismatchedNTDs < bestMismatch:
                bestMismatch = mismatchedNTDs
                Score = 2*sizeOfMotif-bestMismatch
                startLocs[SeqIndex0]=i
                startLocs[SeqIndex1]=j
            j+=1
        i+=1
    return (Score)


#AlignmentScore
#input:     sequenceToCompare   List of sequences whose substrings will be 
#           sizeOfMotif         aligned integer size of the motif being found 
#                               (length of the subseqeunce that will be aligned
#                                from each sequence in above list (window size)
#           startLocs           start locations of the motifs to be aligned to
#                               each other
#           CompareList         the indices of sequencesToCompare to be aligned
#           numSeqs             the number of sequences -1 from 
#                               sequencesToCompare to be aligned. ie, the indices
#                               of sequenceToCompare stored in the first numSeqs
#                               indices of in CompareList.
#           Frequency           markov model being used to calculate background
#           originalSeqs        contain the unmasked sequences used for checking
#                               the markov score
#           NumOfMismatch	number of mismatches allowable
#output:    integer             Score indicating the consensus score of these
#                               sequences
#           2D-List             contains the PSFM
#           2D-List             contains the log markov scores
#
#will be calculated as an ungapped consensus between those elements in the 
#CompareList Consensus score is calculated by choosing the largest number of the 
#same elements in each column, and adding all these numbers up across all
#columns
def AlignmentScore(sequencesToCompare, sizeOfMotif, startLocs, CompareList, numSeqs, Frequency, originalSeqs, MARKOV_WINDOW,NumOfMismatch):
    TotalScore = 0;
    Scores = []
    MW1=MARKOV_WINDOW-1
    PWM = []
    Log = []
    ConsensusScore = 0
    len(sequencesToCompare)
    #traverse each column individually
    for i in xrange (sizeOfMotif):
        divisor = 0.0
        PWMi = [0.0, 0.0, 0.0, 0.0]
        Logi = [0.0, 0.0, 0.0, 0.0]
        #traverse this index in each sequence        
        for j in xrange(numSeqs+1):
            SequenceIndex = CompareList[j]
            CurrSeq = sequencesToCompare[SequenceIndex]
            CurrOr = originalSeqs[SequenceIndex]
            #some sequences may not contain the motifs, if so you do not want 
            #to include them in the consensus. These have uninitialized start
            #locations (ie startLocs would be -1
            if startLocs[SequenceIndex] != -1:
                divisor += 1.0
                if sequencesToCompare[SequenceIndex][startLocs[SequenceIndex]+i] == 'N':
                    print sequencesToCompare
                    print "\nBAD HERE!"
                    print CurrSeq
                    print startLocs
                    print startLocs[SequenceIndex]
                    print CompareList
                    print j
                    print SequenceIndex
                    print numSeqs
                    print sizeOfMotif
                    print CurrSeq[startLocs[SequenceIndex]:startLocs[SequenceIndex]+sizeOfMotif]

                PWMi[NTDIndices[CurrSeq[startLocs[SequenceIndex]+i]]] += 1.0
                Logi[NTDIndices[CurrSeq[startLocs[SequenceIndex]+i]]] += Frequency[CurrOr[startLocs[SequenceIndex]-MARKOV_WINDOW+i+1:startLocs[SequenceIndex]+i+1]]

        Scores.append(0)
        Top = -1
        for NTD in ['A', 'C', 'G', 'T']:
            #avoid ln(0) errors
            if PWMi[NTDIndices[NTD]] > 0:
                Scores[i] += PWMi[NTDIndices[NTD]]/divisor *log(PWMi[NTDIndices[NTD]]/(divisor*Logi[NTDIndices[NTD]]/PWMi[NTDIndices[NTD]]), e)
            if PWMi[NTDIndices[NTD]] > Top:
                Top = PWMi[NTDIndices[NTD]]
        TotalScore += Scores[i]
        ConsensusScore += Top
        PWM.append(PWMi)
        Log.append(Logi)

    return (TotalScore, Scores, ConsensusScore,PWM, Log)


#MarkovFreq
#input:     prefix      string of length MARKV_SIZE - 1 prefix used for model
#           actualNTD   character NTD at the en dof the prefix being calculated
#           Frequency   Markov model for calculations
#output:    float that gives the markov score for this specific sequence
#
#The helper function will run through and find all possible words with the
#prefix and determine the markov score based on this
def MarkovFreq (prefix, actualNTD, Frequency):

    denominator = 0.0
    numerator = 0.0
    for NTD in ['A', 'C', 'G', 'T']:
        value = M_Score(prefix+NTD, Frequency, False,MARKOV_WINDOW)
        if NTD == actualNTD :
            numerator = value
        denominator += value
    retVal = numerator/denominator
    return retVal


#revComp
#input:     sequence    DNA sequence to be converted to reverse complement
#output:    string      reverse complement of input sequence
#
#obtains the reverse complement of an input sequence
def revComp (sequence):
    #base pairs
    RevDict={'A':'T','T':'A', 'C':'G', 'G':'C', 'N':'N'}
    reverse = ""
    #reverse the sequene
    for i in xrange(len(sequence)):
        reverse = RevDict[sequence[i].upper()]+reverse
    return reverse


#Markov3
#input:     sequences   list that are being used to create the background
#output:    dictionary of all 6mers (reverse complement also) and their -log2
#           proportion seen
#
#background will build a markov model of the background in order to be able     
#to differentiate the motifs from the pure background of a certain size
#they will be stored as -log(fraction)
def Markov(sequences, IncludeRC,MARKOV):
    MARKOV_WINDOW = MARKOV + 1
    WordDict = {}
    totalWindows = 0

    #take each sequence and use it in order to separately determine background
    for seq in sequences:
        #all sequences for the background must be full-length            
        for index in xrange(len(seq)-MARKOV):
            subseq = seq[index:index+MARKOV_WINDOW].upper()
            if "N" in subseq:
                continue

            totalWindows += 1
            if subseq not in WordDict:
                WordDict[subseq] = 0.0
            WordDict[subseq] += 1.0
            if IncludeRC:
                totalWindows += 1
                RC = revComp(subseq)
                if RC not in WordDict:
                    WordDict[RC] = 0.0
                WordDict[RC] += 1.0

    #convert to logs                
    for key in WordDict:
        WordDict[key] = 1.0*WordDict[key]/totalWindows
    return WordDict    


#Average_M
#input:     sequences   List of sequences on which to find the average
#                       markov score
#           Model       Dictionary containing pvalues for seeing 3mers
#           l           integer designating the word sizes from which to
#                       determine average pvalue
#output:    average probability of all input lmers in the sequences in the Model
#
#finds the probability of seeing all subsequence in the total strings
#using the markov model created using the background. Markov3 is used
#(window size of 3) and from this determine the average. This functio will
#also screen the background model
def Average_M (sequence, Model, l,MARKOV_WINDOW):
    totalSum = 0.0;
    totalWords = 0.0;
    MW1=MARKOV_WINDOW-1
    for seq in sequence:
        for i in xrange(MW1,len(seq)-l+1):
            sequenceCheck=seq[i-MW1:i+1]
            Nindex=sequenceCheck.rfind('N')
            if Nindex >= 0:
                continue
            totalWords += 1.0     #increase number of words
            PVal = M_Score(sequenceCheck, Model, True,MARKOV_WINDOW)
            #add current word to the running total of scores
            totalSum += PVal
    retVal = totalSum/totalWords
    print totalWords

    return retVal


#M_Score
#input:     sequence    string for which the Pvalue is to be determined
#           Model       Dictionary containing log2 pvalues for seeing 6mers
#           check       Boolean which determines whether to also check for
#                       completeness of markov model
#output:    log2 probability of seeing the input seqeunce in the Model
#
#gives the probability of seeing the given subsequence in the total strings
#using the markov model created using the background. Markov6 is used
#(window size of 3)
def M_Score (sequence, Model, check,MARKOV_WINDOW):
    PVal = 0.0
    MW1=MARKOV_WINDOW-1
    for j in xrange(len(sequence)-MARKOV_WINDOW+1):
        #if the subsequences is not in the background value it
        #the program returns an exit code and asks the user to
        #revise background model input
        if sequence[j:j+MARKOV_WINDOW] not in Model:
            if check:
                print "The Markov Model is inadequate for your input",
                print "sequences\n %s is"%sequence[j:j+MARKOV_WINDOW],
                print "not contained in model provided\n",
                print "Please revise your provided model or consider",
                print "using Background Modelling provided"
                sys.exit(0)
            continue
        #calculates score
        PVal += -log(Model[sequence[j:j+MARKOV_WINDOW]],e)

    return PVal

#LogOdds
#input:     sequences   sequences for which to find the log odds score
#           startLocs   start locations at which to start computing score
#           sizeOfMotif size of the motif being created
#           Freqeuncy   Markov3 Model
#           Index       Current Indices
#           CompareList Order of Comparison
#output     returns the log odds score for the consensus
#the equation used is as follow:
#S(j = 1 to sizeOfMotif (S(i = [A,C,G,T]) f_ij * ln(S(Prob each path))))
def LogOddsoPT(sequences, startLocs, sizeOfMotif, Frequency, Index,CompareList,MARKOV_WINDOW):
    MW1=MARKOV_WINDOW-1
    for i in range(sizeOfMotif):
        for j in range(Index+1):
            seqIndex = CompareList[j]
            lnValue = [0.0, 0.0, 0.0, 0.0]
            totalNum = [0.0, 0.0, 0.0, 0.0]
            #add the frequency of seeing the given nucleotides in each position
            #a runnning total of each one. Then add them to the equation
            if startLocs[seqIndex] != -1:
                index = startLocs[seqIndex]+i
                previous = sequences[seqIndex][index-(MW1):index+1]
                denominator = 0.0
                numerator = 0.0
                #determine probabilities for each path
                for NTD in ['A', 'C', 'G', 'T']:
                    full = previous+NTD
                    if full in Frequency:
                        value = Frequency[full]
                        if NTD == sequences[seqIndex][index+1]:
                            numerator = value
                        denominator += value
                #increase the summation of each location
                lnValue[NTDIndices[sequences[seqIndex][index+1]]] +=numerator/denominator
                #increase number of given nucleotides at the index
                totalNum[NTDIndices[sequences[seqIndex][index+1]]] += 1.0

 
#LogOdds
#input:     sequence    relevant part of the sequence being added to the PWM
#           PWM         information on sequences already in the motif
#           LogsM       frequency information on sequences already in motif
#           Frequency   markov model for background
#           sizeOfMotif size f the motif being found
#output     returns the log odds score for the consensus
#the equation used is as follow:
#S(j = 1 to sizeOfMotif (S(i = [A,C,G,T]) f_ij * ln(S(Prob each path))))
def LogOdds(PWM, LogsM, sequence, Frequency,MARKOV_WINDOW):
    Score = 0
    PWMout = copy.deepcopy(PWM)
    LogsMout = copy.deepcopy(LogsM)
    MW1=MARKOV_WINDOW-1
    #since each column of the PWM must add up to the total umber of sequences
    #in that PWM, in addition one must be added for the current sequence
    totalSeqs = PWM[0][0]+PWM[0][1]+PWM[0][2]+PWM[0][3] + 1
    for j in range(len(PWMout)):
        for i in ['A', 'C', 'G', 'T']:
            if i == sequence[j+MW1]:
                PWMout[j][NTDIndices[i]] += 1.0
                #prefix to check for prob of specific ntd given prefix
                word = sequence[j:j+MARKOV_WINDOW]
                #determine the frequency of the words individually                     
                LogsMout[j][NTDIndices[i]] += Frequency[word]   
            if PWMout[j][NTDIndices[i]]> 0:
                Score +=PWMout[j][NTDIndices[i]]/totalSeqs*log(PWMout[j][NTDIndices[i]]/(totalSeqs*LogsMout[j][NTDIndices[i]]/PWMout[j][NTDIndices[i]]),e)

    return Score, PWMout, LogsMout


#convert2motif
#input:     sequences   list containing the sequences in A,C,G,T alphabet 
#                       comprising the motif to be converted into symbols
#           size        size of the motif
# maybe add threshold?!?!
#output:    string      motif converted into descriptive symbols
#
#takes in a list of motifs that were found at each point and converts them to
#an actual motif
def convert2motif(sequences, size):
    #column composition is replaced by symbols
    SymbolDict = {'CGT':'B','AGT':'D','ACT':'H','GT':'K','AC':'M',
                  'ACGT':'N','AG':'R','CG':'S','ACG':'V','AT':'W',
                  'CT':'Y','A':'A','C':'C','G':'G','T':'T'}
    Motif = ""
    #determine the composition of each column
    numSeqs = len(sequences)
    for i in xrange(size):
        A = 0.
        C = 0.
        G = 0.
        T = 0.       
        for seq in sequences:
            if seq[i].upper() == "A":
                A += 1
            elif seq[i].upper() == "C":
                C += 1
            elif seq[i].upper() == "G":
                G += 1
            else:
                T += 1

        characterCode = ""
        if (A/numSeqs > 0.1):
            characterCode += "A"
        if (C/numSeqs > 0.1):
            characterCode += "C"
        if (G/numSeqs > 0.1):
            characterCode += "G"
        if (T/numSeqs > 0.1):
            characterCode += "T"
        Motif += SymbolDict[characterCode]

    return Motif


#PMW2motif
#input:     array       PWM
#output:    string      motif converted into descriptive symbols
#
#takes in a PWM that was created and converts it to 
#an actual motif
def PWM2Motif(PWM):
    #column composition is replaced by symbols
    SymbolDict = {'CGT':'B','AGT':'D','ACT':'H','GT':'K','AC':'M',
                  'ACGT':'N','AG':'R','CG':'S','ACG':'V','AT':'W',
                  'CT':'Y','A':'A','C':'C','G':'G','T':'T'}
    Motif = ""
    #determine the composition of each column
    for i in xrange(len(PWM)):
        characterCode = ""
        if (PWM[i][NTDA] > 0.1):
            characterCode += "A"
        if (PWM[i][NTDC] > 0.1):
            characterCode += "C"
        if (PWM[i][NTDG] > 0.1):
            characterCode += "G"
        if (PWM[i][NTDT] > 0.1):
            characterCode += "T"
        Motif += SymbolDict[characterCode]

    return Motif


#convert2PSFM
#input:     sequences   list containing the sequences in A,C,G,T alphabet 
#                       comprising the motif to be converted into symbols
#           size        size of the motif
#output:    2Darray     will contain the PSFM where indices 0-3 of each list 
#                       will be A,C,G,T respectively
#
#takes in a list of motifs that were found at each point and converts them to
#a PSFM
def convert2PSFM (PWM, NumOfSeqs):
    #Position specific frequency matrix to be returned
    PSFM = []
    #determine the composition of each column
    for i in xrange(len(PWM)):
        index = []
        #get frequencies for each NTD
        for j in [0,1,2,3]:
            index.append(PWM[i][j]/NumOfSeqs)
        #add all nucleotide frequencies to the columns
        PSFM.append(index)

    return PSFM


#convert2PWM
#input:     sequences   list containing the sequences in A,C,G,T alphabet 
#                       comprising the motif to be converted into symbols
#           size        size of the motif
#output:    2Darray     will contain the PSFM where indices 0-3 of each list 
#                       will be A,C,G,T respectively
#
#takes in a list of motifs that were found at each point and converts them to
#a PWM
def convert2PWM (sequences, size):
    #Position specific frequency matrix to be returned
    PWM = []
    #determine the composition of each column
    for i in xrange(size):
        indices = [0.0, 0.0, 0.0, 0.0]
        for seq in sequences:
            indices[NTDIndices[seq[i].upper()]] += 1.0
        #add all nucleotide frequencies to the columns
        PWM.append(indices)

    return PWM


#add2PWM
#input:     sequence    sequence to be added to PWM
#           PWM         PWM being modiifed         
#
#mutator method takes in a sequence and adds it to the PWM
def add2PWM(sequence, PWM):
    #determine the composition to add
    for i in xrange(len(PWM)):
        PWM[i][NTDIndices[sequence[i].upper()]] += 1.0


#add2PWMReturn
#input:     sequence    sequence to be added to PWM
#           PWM         PWM being modiifed         
#
#mutator method takes in a sequence and adds it to the PWM
def add2PWMReturn(sequence, PWM):
    retPWM=copy.deepcopy(PWM)
    #determine the composition to add
    for i in xrange(len(retPWM)):
        retPWM[i][NTDIndices[sequence[i].upper()]] += 1.0

    return retPWM


#Align2PWM
#input:     Motifi      Sequence being aligned to PWM
#           PWM         PWM to which the sequence is being aligned
#output:    float       alignment score to the matrix
#
#takes in a PWM and aligns the sequence to this PWM and returns
#the consensus scoreo
def Align2PSFM(Motifi,PWM):
    Best = 0.0
    for i in xrange(len(PWM)):
        CurrentIndex = PWM[i][:]
        CurrentIndex[NTDIndices[Motifi[i].upper()]] += 1.0
        Top = CurrentIndex[0]
        for j in [1,2,3]:
            if Top < CurrentIndex[j]:
                Top = CurrentIndex[j]
        Best += Top

    return Best


#Factorial
#input:     n       Number for which we will calculate the factorial
#output:    float   factorial of input number
#
#calculates the factorial of input number n
def Factorial(n):
    #by definition factorial should be 1
    Fact = 1
    for i in xrange(2,n+1):
        Fact *= i

    return Fact


#Choose
#input:     n       integer for total number of elements in a set
#           k       integer for total number of elements to be chose from set
#output:    float   the binomial coefficient of the above number, ie nCk
#
#calculates the number of ways to choose k elements from a set of n
def Choose(n, k):
    return Factorial(n)/Factorial(n-k)/Factorial(k)


#GetMotif
#input: sequencesToCompare  sequences where the motif is being found
#       sequencesLast       original version of the sequences
#       CompareList         Order in which sequences are examined
#       Best Motif          The location where the motifs score highest
#       startLocs           The current locations for the motifs being examined
#       sizeOfMotif         Size of the motif to be found in the sequences
#       Index               Index up to which the motif has been optimized so
#                           far
#       BestScore           The best score of aligned motifs
#       Frequency           Markov3 Model
#       PWM                 PWM which includes alll the sequences which have
#                           been included so far
#       PWMFounder          This is the "founder" PWM, includes only the
#                           founder sequences' information
#       ConScoreTop         This value stores the Consensus score of the
#                           founder sequences only
#       normalization       normalization value for the current words
#       LogsM               frequency of markov words
#       percID              percent of the aligned sequences that must be 
#                           identical in order to be considered for a motif to
#                           be considered (between the founder sequences then
#                           between founder sequence consensus and each other
#                           subsequence being considered)
#       originalSeqs        contain unmasked sequence for markov scores
#output:integer             Score of the top scoring motif
#
#this function will align the Indexth item in the sequencesToCompare to all the
#sequences previously aligned from their best indices in a way to optimize the
#alignment score (consensus score)

def GetMotif (sequencesToCompare, sequencesLast, CompareList, BestMotif,
              startLocs, sizeOfMotif, Index, BestScore, Frequency,
              PWM, PWMFounder, ConScoreTop, normalization, LogsM,percID,
              originalSeqs,origLast,MARKOV_WINDOW):

    MW1=MARKOV_WINDOW-1
    #variable to indicate whether sequencesLast needs to be updated
    NewBestDetermined = False
    SeqIndexi = CompareList[Index]

    #search through all positions for this index
    starti=MW1
    endi=len(sequencesToCompare[SeqIndexi])-sizeOfMotif
    while (starti<=endi):
        Motifi = sequencesToCompare[SeqIndexi][starti:starti + sizeOfMotif]
        MMotifi = originalSeqs[SeqIndexi][starti-MW1:starti + sizeOfMotif]

        #if this area has already been earmarked as a motif it cannot be
        #used again
        Nlocation=Motifi.rfind('N')
        if Nlocation >= 0:
            starti+=1+Nlocation+MARKOV_WINDOW

        #check to see if the current sequence has a high enough consensus
        #with the founder sequences in order to be considered further
        ScoreTop = Align2PSFM(Motifi, PWMFounder)
        Poss = float(ScoreTop - ConScoreTop)
        GOF = Poss/sizeOfMotif
        if GOF < percID/100.0:
            starti+=1
            continue

        #if the word is not above noise
        if M_Score(MMotifi,Frequency, False,MARKOV_WINDOW) - normalization < 0:
            starti+=1
            continue

        startLocs[SeqIndexi] = starti
        #new best scores are assigned by aligning the current sequence to
        #the PWM
        (ScoreAll, PWM2, LogsM2) = LogOdds(PWM, LogsM, originalSeqs[SeqIndexi] [starti-MW1:starti+sizeOfMotif],Frequency,MARKOV_WINDOW)
        Score = ScoreAll

        #new best scores are evaluted
        if (BestScore == "Infinity" and Score != "Infinity") or Score > BestScore :
            #record best position
            BestMotif[SeqIndexi] = startLocs[SeqIndexi]
            BestScore = Score
            LogsMhold = LogsM2[:]
            PWMhold = PWM2[:]
            NewBestDetermined = True
    
        starti+=1

    startLocs[SeqIndexi] = BestMotif[SeqIndexi]
    if NewBestDetermined:
        PWM = PWMhold[:]
        LogsM = LogsMhold[:]

    return (BestScore, NewBestDetermined, PWM, LogsM)