##################################
#                                #
# Last modified 2019/04/07       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import re
import sys
import string

def getReverseComplement(preliminarysequence):
    
    DNA = {'A':'T',
           'T':'A',
           'G':'C',
           'C':'G',
           'N':'N',
           'R':'N',
           'Y':'N',
           'S':'N',
           'W':'N',
           'K':'N',
           'M':'N',
           'B':'N',
           'D':'N',
           'H':'N',
           'V':'N',
           'a':'t',
           't':'a',
           'g':'c',
           'c':'g',
           'n':'n'}
    sequence=''
    for j in range(len(preliminarysequence)):
        sequence=sequence+DNA[preliminarysequence[len(preliminarysequence)-j-1]]
    return sequence

def run():

    if len(sys.argv) < 3:
        print 'usage: python %s sequence.fa minORFLength outfileprefix [-NonStandardGeneticCode 2|3|4|5|6|9|10|12|13|14|16|21|22|23|24|25] [-NonStandardInitiation species] [-6frame]' % sys.argv[0]
        print 'Nonstandard genetic codes:'
        print '\t2. The Vertebrate Mitochondrial Code'
        print '\t3. The Yeast Mitochondrial Code'
        print '\t4. The Mold, Protozoan, and Coelenterate Mitochondrial Code and the Mycoplasma/Spiroplasma Code'
        print '\t5. The Invertebrate Mitochondrial Code'
        print '\t6. The Ciliate, Dasycladacean and Hexamita Nuclear Code'
        print '\t9. The Echinoderm and Flatworm Mitochondrial Code'
        print '\t10. The Euplotid Nuclear Code'
        print '\t12. The Alternative Yeast Nuclear Code'
        print '\t13. The Ascidian Mitochondrial Code'
        print '\t14. The Alternative Flatworm Mitochondrial Code'
        print '\t16. Chlorophycean Mitochondrial Code'
        print '\t21. Trematode Mitochondrial Code'
        print '\t22. Scenedesmus obliquus Mitochondrial Code'
        print '\t23. Thraustochytrium Mitochondrial Code'
        print '\t24. Pterobranchia Mitochondrial Code'
        print '\t25. Candidate Division SR1 and Gracilibacteria Code'
        print '\tAlternative initiation codons:'
        print '\tTrypanosoma: UUA, UUG, CUG'
        print '\tLeishmania: AUU, AUA'
        print '\tTertrahymena: AUU, AUA, AUG'
        print '\tParamecium: AUU, AUA, AUG, AUC, GUG, GUA'
        print '\tif the [-6frame] option is used, for transcript where no ORF satisfying the minimum length requirement has been found the longest valid translations of the whole transcript from either end will be printed'
        sys.exit(1)

    fasta = sys.argv[1]
    minORFlength = int(sys.argv[2])
    outprefix = sys.argv[3]

    doSixFrame = False
    if '-6frame' in sys.argv:
        doSixFrame = True

    CodonDict={'GCU':'A', 'GCC':'A', 'GCA':'A', 'GCG':'A',
               'UUA':'L', 'UUG':'L', 'CUU':'L', 'CUC':'L', 'CUA':'L', 'CUG':'L',
               'CGU':'R', 'CGC':'R', 'CGA':'R', 'CGG':'R', 'AGA':'R', 'AGG':'R',
               'AAA':'K', 'AAG':'K',
               'AAU':'N', 'AAC':'N',
               'AUG':'M',
               'GAU':'D', 'GAC':'D',
               'UUU':'F', 'UUC':'F',
               'UGU':'C', 'UGC':'C',
               'CCU':'P', 'CCC':'P', 'CCA':'P', 'CCG':'P',
               'CAA':'Q', 'CAG':'Q',
               'UCU':'S', 'UCC':'S', 'UCA':'S', 'UCG':'S', 'AGU':'S', 'AGC':'S',
               'GAA':'E', 'GAG':'E',
               'ACU':'T', 'ACC':'T', 'ACA':'T', 'ACG':'T',
               'GGU':'G', 'GGC':'G', 'GGA':'G', 'GGG':'G',
               'UGG':'W',
               'CAU':'H', 'CAC':'H',
               'UAU':'Y', 'UAC':'Y',
               'AUU':'I', 'AUC':'I', 'AUA':'I',
               'GUU':'V', 'GUC':'V', 'GUA':'V', 'GUG':'V',
               'START':'AUG',
               'UAA':'STOP',
               'UGA':'STOP',
               'UAG':'STOP'}
    STOPCODONREGEXP = '(UGA|UAA|UAG)'

###### http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi#SG6

    if '-NonStandardGeneticCode' in sys.argv:
        NSGC = sys.argv[sys.argv.index('-NonStandardGeneticCode') + 1]
        print 'Will use nonstandard genetic code', NSGC
        if NSGC == '2':
            CodonDict['AGA'] = 'STOP'
            CodonDict['AGG'] = 'STOP'
            CodonDict['AUA'] = 'M'
            CodonDict['UGA'] = 'W'
            STOPCODONREGEXP = '(AGA|AGG|UAA|UAG)'
        if NSGC == '3':
            CodonDict['AUA'] = 'M'
            CodonDict['CUU'] = 'T'
            CodonDict['CUC'] = 'T'
            CodonDict['CUA'] = 'T'
            CodonDict['CUG'] = 'T'
            CodonDict['UGA'] = 'W'
            STOPCODONREGEXP = '(UAA|UAG)'
        if NSGC == '4':
            CodonDict['UGA'] = 'W'
            STOPCODONREGEXP = '(UAA|UAG)'
        if NSGC == '5':
            CodonDict['AGA'] = 'S'
            CodonDict['AGG'] = 'S'
            CodonDict['AUA'] = 'M'
            CodonDict['UGA'] = 'W'
            STOPCODONREGEXP = '(UAA|UAG)'
        if NSGC == '6':
            CodonDict['UAA'] = 'Q'
            CodonDict['UAG'] = 'Q'
            STOPCODONREGEXP = '(UGA)'
        if NSGC == '9':
            CodonDict['AAA'] = 'N'
            CodonDict['AGA'] = 'S'
            CodonDict['AGG'] = 'S'
            CodonDict['UGA'] = 'W'
            STOPCODONREGEXP = '(UAA|UAG)'
        if NSGC == '10':
            CodonDict['UGA'] = 'C'
            STOPCODONREGEXP = '(UAA|UAG)'
        if NSGC == '12':
            CodonDict['CUG'] = 'S'
        if NSGC == '13':
            CodonDict['AGA'] = 'G'
            CodonDict['AGG'] = 'G'
            CodonDict['AUA'] = 'M'
            CodonDict['UGA'] = 'W'
            STOPCODONREGEXP = '(UAA|UAG)'
        if NSGC == '14':
            CodonDict['AAA'] = 'N'
            CodonDict['AGA'] = 'S'
            CodonDict['AGG'] = 'S'
            CodonDict['UAA'] = 'Y'
            CodonDict['UGA'] = 'W'
            STOPCODONREGEXP = '(UAG)'
        if NSGC == '16':
            CodonDict['TAG'] = 'L'
        if NSGC == '21':
            CodonDict['TGA'] = 'W'
            CodonDict['ATA'] = 'M'
            CodonDict['AGA'] = 'S'
            CodonDict['AGG'] = 'S'
            CodonDict['AAA'] = 'N'
        if NSGC == '22':
            CodonDict['TCA'] = 'STOP'
            CodonDict['TAG'] = 'L'
        if NSGC == '23':
            CodonDict['TTA'] = 'STOP'
        if NSGC == '24':
            CodonDict['AGA'] = 'S'
            CodonDict['AGG'] = 'K'
            CodonDict['UGA'] = 'W'
            STOPCODONREGEXP = '(UAA|UAG)'

    STARTCODONREGEXP = '(AUG)'
    if '-NonStandardInitiation' in sys.argv:
        species = sys.argv[sys.argv.index('-NonStandardInitiation') + 1]
        print 'Will use alternative initiation codons:', species
        if species == 'Trypanosoma':
            STARTCODONREGEXP = '(AUG|UUA|UUG|CUG)'
        if species == 'Leishmania':
            STARTCODONREGEXP = '(AUG|AUU|AUA)'
        if species == 'Tertrahymena':
            STARTCODONREGEXP = '(AUG|AUU|AUA|AUG)'
        if species == 'Paramecium':
            STARTCODONREGEXP = '(AUG|AUU|AUA|AUC|GUG|GUA)'
    
    SequenceDict = {}
    inputdatafile = open(fasta)
    ID=''
    for line in inputdatafile:
        if line[0]=='>':
            if ID == '':
                ID = line.strip().split('>')[1]
            else:
                sequence = ''.join(sequence)
                SequenceDict[ID]=sequence.upper()
                ID = line.strip().split('>')[1]
            sequence=[]
        else:
            sequence.append(line.strip())   
    sequence = ''.join(sequence)
    SequenceDict[ID]=sequence.upper()

    print 'finished inputting sequence'

    outfileGTF = open(outprefix + '.gtf', 'w')
    outfileORF = open(outprefix + '.faa', 'w')

    foundORFs = 0
    for chr in SequenceDict.keys():
        FoundORF = False
        sequence = SequenceDict[chr].replace('T','U').replace('W','N').replace('S','N').replace('R','N').replace('H','N').replace('Y','N').replace('M','N').replace('K','N').replace('B','N').replace('D','N').replace('V','N')
        L = len(sequence)
        AUGpositions=[]
        m = re.compile(STARTCODONREGEXP)
        for mo in m.finditer(sequence):
            AUGpositions.append(mo.start())
        STOPpositions = {}
        m = re.compile(STOPCODONREGEXP)
        for mo in m.finditer(sequence):
            STOPpositions[mo.start()] = 1
        ORFDict = {}
        for AUG in AUGpositions:
            inORF = True
            i = AUG
            while inORF and i < L:
                i+=3
                if STOPpositions.has_key(i):
                    inORF = False
                    if i-AUG >= 3*minORFlength:
                        if ORFDict.has_key(i):
                            pass
                        else:
                            ORFDict[i] = []
                        ORFDict[i].append(AUG)
                        FoundORF = True
        STOPcodons = ORFDict.keys()
        STOPcodons.sort()
        for STOP in STOPcodons:
            START = min(ORFDict[STOP])
            foundORFs += 1
            ORF = ''
            for i in range(START,STOP,3):
                if 'N' in sequence[i:i+3]:
                    ORF += 'X'
                else:
                    ORF += CodonDict[sequence[i:i+3]]
            outline = chr + '\t' + 'predicted_ORF' + '\t' + 'CDS' + '\t' + str(START + 1) + '\t' + str(STOP + 3 + 1) + '\t.\t' + '+' + '\t' + '.' + '\t' + 'gene_id "ORF' + str(foundORFs) + '";' + ' ' + 'transcript_id "ORF' + str(foundORFs) + '";'
            outfileGTF.write(outline + '\n')
            outline = '>ORF' + str(foundORFs)
            outfileORF.write(outline + '\n')
            outfileORF.write(ORF + '\n')

        revsequence = getReverseComplement(SequenceDict[chr]).replace('T','U')
        AUGpositions=[]
        m = re.compile(STARTCODONREGEXP)
        for mo in m.finditer(revsequence):
            AUGpositions.append(mo.start())
        STOPpositions = {}
        m = re.compile(STOPCODONREGEXP)
        for mo in m.finditer(revsequence):
            STOPpositions[mo.start()] = 1
        ORFDict = {}
        for AUG in AUGpositions:
            inORF = True
            i = AUG
            while inORF and i < L:
                i+=3
                if STOPpositions.has_key(i):
                    inORF = False
                    if i-AUG >= 3*minORFlength:
                        if ORFDict.has_key(i):
                            pass
                        else:
                            ORFDict[i] = []
                        ORFDict[i].append(AUG)
                        FoundORF = True
        STOPcodons = ORFDict.keys()
        STOPcodons.sort()
        STOPcodons.reverse()
        for STOP in STOPcodons:
            START = min(ORFDict[STOP])
            foundORFs += 1
            ORF = ''
            for i in range(START,STOP,3):
                if 'N' in revsequence[i:i+3]:
                    ORF += 'X'
                else:
                    ORF += CodonDict[revsequence[i:i+3]]
            outline = chr + '\t' + 'predicted_ORF' + '\t' + 'CDS' + '\t' + str(L - STOP - 3) + '\t' + str(L - START) + '\t.\t' + '-' + '\t' + '.' + '\t' + 'gene_id "ORF' + str(foundORFs) + '";' + ' ' + 'transcript_id "ORF' + str(foundORFs) + '";'
            outfileGTF.write(outline + '\n')
            outline = '>ORF' + str(foundORFs)
            outfileORF.write(outline + '\n')
            outfileORF.write(ORF + '\n')

        if doSixFrame and not FoundORF and len(sequence) >= 3*minORFlength:
            ORF = 'X'
            for i in range(0,len(sequence)-3,3):
                if 'N' in sequence[i:i+3]:
                    AA = 'X'
                else:
                    AA = CodonDict[sequence[i:i+3]]
                if AA == 'STOP':
                    ORF += 'X'
                else:
                    ORF += AA
            ORF += 'X'
#            outline = chr + '\t' + 'forward_frame_1' + '\t' + 'CDS' + '\t' + str(0) + '\t' + str(len(sequence)-3) + '\t.\t' + '-' + '\t' + '.' + '\t' + 'gene_id "' + chr + '_forward_frame_1' + '";' + ' ' + 'transcript_id "' + chr + '_forward_frame_1' + '";'
#            outfileGTF.write(outline + '\n')
            outline = '>' + chr + 'forward_frame_1'
            outfileORF.write(outline + '\n')
            outfileORF.write(ORF + '\n')
            ORF = 'X'
            for i in range(1,len(sequence)-5,3):
                if 'N' in sequence[i:i+3]:
                    AA = 'X'
                else:
                    AA = CodonDict[sequence[i:i+3]]
                if AA == 'STOP':
                    ORF += 'X'
                else:
                    ORF += AA
            ORF += 'X'
#            outline = chr + '\t' + 'forward_frame_2' + '\t' + 'CDS' + '\t' + str(1) + '\t' + str(len(sequence)-5) + '\t.\t' + '-' + '\t' + '.' + '\t' + 'gene_id "' + chr + '_forward_frame_2' + '";' + ' ' + 'transcript_id "' + chr + '_forward_frame_2' + '";'
#            outfileGTF.write(outline + '\n')
            outline = '>' + chr + 'forward_frame_2'
            outfileORF.write(outline + '\n')
            outfileORF.write(ORF + '\n')
            ORF = 'X'
            for i in range(2,len(sequence)-4,3):
                if 'N' in sequence[i:i+3]:
                    AA = 'X'
                else:
                    AA = CodonDict[sequence[i:i+3]]
                if AA == 'STOP':
                    ORF += 'X'
                else:
                    ORF += AA
            ORF += 'X'
#            outline = chr + '\t' + 'forward_frame_3' + '\t' + 'CDS' + '\t' + str(2) + '\t' + str(len(sequence)-4) + '\t.\t' + '-' + '\t' + '.' + '\t' + 'gene_id "' + chr + '_forward_frame_2' + '";' + ' ' + 'transcript_id "' + chr + '_forward_frame_3' + '";'
#            outfileGTF.write(outline + '\n')
            outline = '>' + chr + 'forward_frame_3'
            outfileORF.write(outline + '\n')
            outfileORF.write(ORF + '\n')
            ORF = 'X'
            for i in range(0,len(revsequence)-3,3):
                if 'N' in revsequence[i:i+3]:
                    AA = 'X'
                else:
                    AA = CodonDict[revsequence[i:i+3]]
                if AA == 'STOP':
                    ORF += 'X'
                else:
                    ORF += AA
            ORF += 'X'
#            outline = chr + '\t' + 'reverse_frame_1' + '\t' + 'CDS' + '\t' + str(0) + '\t' + str(len(revsequence)-3) + '\t.\t' + '-' + '\t' + '.' + '\t' + 'gene_id "' + chr + '_reverse_frame_1' + '";' + ' ' + 'transcript_id "' + chr + '_reverse_frame_1' + '";'
#            outfileGTF.write(outline + '\n')
            outline = '>' + chr + 'reverse_frame_1'
            outfileORF.write(outline + '\n')
            outfileORF.write(ORF + '\n')
            ORF = 'X'
            for i in range(1,len(revsequence)-5,3):
                if 'N' in revsequence[i:i+3]:
                    AA = 'X'
                else:
                    AA = CodonDict[revsequence[i:i+3]]
                if AA == 'STOP':
                    ORF += 'X'
                else:
                    ORF += AA
            ORF += 'X'
#            outline = chr + '\t' + 'reverse_frame_2' + '\t' + 'CDS' + '\t' + str(1) + '\t' + str(len(revsequence)-5) + '\t.\t' + '-' + '\t' + '.' + '\t' + 'gene_id "' + chr + '_reverse_frame_2' + '";' + ' ' + 'transcript_id "' + chr + '_reverse_frame_2' + '";'
#            outfileGTF.write(outline + '\n')
            outline = '>' + chr + 'reverse_frame_2'
            outfileORF.write(outline + '\n')
            outfileORF.write(ORF + '\n')
            ORF = 'X'
            for i in range(2,len(revsequence)-4,3):
                if 'N' in revsequence[i:i+3]:
                    AA = 'X'
                else:
                    AA = CodonDict[revsequence[i:i+3]]
                if AA == 'STOP':
                    ORF += 'X'
                else:
                    ORF += AA
            ORF += 'X'
#            outline = chr + '\t' + 'reverse_frame_3' + '\t' + 'CDS' + '\t' + str(2) + '\t' + str(len(revsequence)-4) + '\t.\t' + '-' + '\t' + '.' + '\t' + 'gene_id "' + chr + '_reverse_frame_2' + '";' + ' ' + 'transcript_id "' + chr + '_reverse_frame_3' + '";'
#            outfileGTF.write(outline + '\n')
            outline = '>' + chr + 'reverse_frame_3'
            outfileORF.write(outline + '\n')
            outfileORF.write(ORF + '\n')

    outfileORF.close()

run()

