##################################
#                                #
# Last modified 06/05/2015       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string

def getReverseComplement(preliminarysequence):
    
    DNA = {'A':'T','T':'A','G':'C','C':'G','N':'N','a':'t','t':'a','g':'c','c':'g','n':'n', 'r':'y', 'R':'Y', 'y':'r', 'Y':'R'}
    sequence=''
    for j in range(len(preliminarysequence)):
        sequence=sequence+DNA[preliminarysequence[len(preliminarysequence)-j-1]]
    return sequence

def run():

    if len(sys.argv) < 3:
        print 'usage: python %s fasta windowsize outputfilename [-CDSexclude gtf] [-NonStandardGeneticCode 2|3|4|5|6|9|10|12|13|14|16|21|22|23|24|25]' % sys.argv[0]
        print '\tNote: check the phasing of the GTF file before use!!!'
        print '\tNote: the -CDSexclude option will exclude all CDS regions with the exception of 4-fold degenerate sites'
        print '\t\tthe -NonStandardGeneticCode option only works when -CDSexclude is enabled' 
        print 'Nonstandard genetic codes:'
        print '\t2. The Vertebrate Mitochondrial Code'
        print '\t3. The Yeast Mitochondrial Code'
        print '\t4. The Mold, Protozoan, and Coelenterate Mitochondrial Code and the Mycoplasma/Spiroplasma Code'
        print '\t5. The Invertebrate Mitochondrial Code'
        print '\t6. The Ciliate, Dasycladacean and Hexamita Nuclear Code'
        print '\t9. The Echinoderm and Flatworm Mitochondrial Code'
        print '\t10. The Euplotid Nuclear Code'
        print '\t12. The Alternative Yeast Nuclear Code'
        print '\t13. The Ascidian Mitochondrial Code'
        print '\t14. The Alternative Flatworm Mitochondrial Code'
        print '\t16. Chlorophycean Mitochondrial Code'
        print '\t21. Trematode Mitochondrial Code'
        print '\t22. Scenedesmus obliquus Mitochondrial Code'
        print '\t23. Thraustochytrium Mitochondrial Code'
        print '\t24. Pterobranchia Mitochondrial Code'
        print '\t25. Candidate Division SR1 and Gracilibacteria Code'
        print '\tAlternative initiation codons are not considered'
        sys.exit(1)

    CodonDict={'GCU':'A', 'GCC':'A', 'GCA':'A', 'GCG':'A',
               'UUA':'L', 'UUG':'L', 'CUU':'L', 'CUC':'L', 'CUA':'L', 'CUG':'L',
               'CGU':'R', 'CGC':'R', 'CGA':'R', 'CGG':'R', 'AGA':'R', 'AGG':'R',
               'AAA':'K', 'AAG':'K',
               'AAU':'N', 'AAC':'N',
               'AUG':'M',
               'GAU':'D', 'GAC':'D',
               'UUU':'F', 'UUC':'F',
               'UGU':'C', 'UGC':'C',
               'CCU':'P', 'CCC':'P', 'CCA':'P', 'CCG':'P',
               'CAA':'Q', 'CAG':'Q',
               'UCU':'S', 'UCC':'S', 'UCA':'S', 'UCG':'S', 'AGU':'S', 'AGC':'S',
               'GAA':'E', 'GAG':'E',
               'ACU':'T', 'ACC':'T', 'ACA':'T', 'ACG':'T',
               'GGU':'G', 'GGC':'G', 'GGA':'G', 'GGG':'G',
               'UGG':'W',
               'CAU':'H', 'CAC':'H',
               'UAU':'Y', 'UAC':'Y',
               'AUU':'I', 'AUC':'I', 'AUA':'I',
               'GUU':'V', 'GUC':'V', 'GUA':'V', 'GUG':'V',
               'START':'AUG',
               'UAA':'STOP',
               'UGA':'STOP',
               'UAG':'STOP'}
    STOPCODONREGEXP = '(UGA|UAA|UAG)'

    CodonDict4FoldDegenerate={'GCU':'A', 'GCC':'A', 'GCA':'A', 'GCG':'L',
                              'CUU':'L', 'CUC':'L', 'CUA':'L', 'CUG':'L',
                              'CGU':'R', 'CGC':'R', 'CGA':'R', 'CGG':'R',
                              'CCU':'P', 'CCC':'P', 'CCA':'P', 'CCG':'P',
                              'UCU':'S', 'UCC':'S', 'UCA':'S', 'UCG':'S',
                              'ACU':'T', 'ACC':'T', 'ACA':'T', 'ACG':'T',
                              'GGU':'G', 'GGC':'G', 'GGA':'G', 'GGG':'G',
                              'GUU':'V', 'GUC':'V', 'GUA':'V', 'GUG':'V'}

    fasta = sys.argv[1]
    window = int(sys.argv[2])
    outfilename = sys.argv[3]

    doCDSEsclude = False
    if '-CDSexclude' in sys.argv:
        doCDSEsclude = True
        gtf = sys.argv[sys.argv.index('-CDSexclude') + 1]
        if '-NonStandardGeneticCode' in sys.argv:
            NSGC = sys.argv[sys.argv.index('-NonStandardGeneticCode') + 1]
            print 'Will use nonstandard genetic code', NSGC
            if NSGC == '2':
                CodonDict['AGA'] = 'STOP'
                CodonDict['AGG'] = 'STOP'
                CodonDict['AUA'] = 'M'
                CodonDict['UGA'] = 'W'
                STOPCODONREGEXP = '(AGA|AGG|UAA|UAG)'
                if NSGC == '3':
                    CodonDict['AUA'] = 'M'
                    CodonDict['CUU'] = 'T'
                    CodonDict['CUC'] = 'T'
                    CodonDict['CUA'] = 'T'
                    CodonDict['CUG'] = 'T'
                    CodonDict['UGA'] = 'W'
                    STOPCODONREGEXP = '(UAA|UAG)'
                if NSGC == '4':
                    CodonDict['UGA'] = 'W'
                    STOPCODONREGEXP = '(UAA|UAG)'
                if NSGC == '5':
                    CodonDict['AGA'] = 'S'
                    CodonDict['AGG'] = 'S'
                    CodonDict['AUA'] = 'M'
                    CodonDict['UGA'] = 'W'
                    STOPCODONREGEXP = '(UAA|UAG)'
                if NSGC == '6':
                    CodonDict['UAA'] = 'Q'
                    CodonDict['UAG'] = 'Q'
                    STOPCODONREGEXP = '(UGA)'
                if NSGC == '9':
                    CodonDict['AAA'] = 'N'
                    CodonDict['AGA'] = 'S'
                    CodonDict['AGG'] = 'S'
                    CodonDict['UGA'] = 'W'
                    STOPCODONREGEXP = '(UAA|UAG)'
                if NSGC == '10':
                    CodonDict['UGA'] = 'C'
                    STOPCODONREGEXP = '(UAA|UAG)'
                if NSGC == '12':
                    CodonDict['CUG'] = 'S'
                if NSGC == '13':
                    CodonDict['AGA'] = 'G'
                    CodonDict['AGG'] = 'G'
                    CodonDict['AUA'] = 'M'
                    CodonDict['UGA'] = 'W'
                    STOPCODONREGEXP = '(UAA|UAG)'
                if NSGC == '14':
                    CodonDict['AAA'] = 'N'
                    CodonDict['AGA'] = 'S'
                    CodonDict['AGG'] = 'S'
                    CodonDict['UAA'] = 'Y'
                    CodonDict['UGA'] = 'W'
                    STOPCODONREGEXP = '(UAG)'
                if NSGC == '16':
                    CodonDict['TAG'] = 'L'
                if NSGC == '21':
                    CodonDict['TGA'] = 'W'
                    CodonDict['ATA'] = 'M'
                    CodonDict['AGA'] = 'S'
                    CodonDict['AGG'] = 'S'
                    CodonDict['AAA'] = 'N'
                if NSGC == '22':
                    CodonDict['TCA'] = 'STOP'
                    CodonDict['TAG'] = 'L'
                if NSGC == '23':
                    CodonDict['TTA'] = 'STOP'
                if NSGC == '24':
                    CodonDict['AGA'] = 'S'
                    CodonDict['AGG'] = 'K'
                    CodonDict['UGA'] = 'W'
                    STOPCODONREGEXP = '(UAA|UAG)'

    GenomeDict={}
    sequence=''
    inputdatafile = open(fasta)
    for line in inputdatafile:
        if line[0]=='>':
            if sequence != '':
                GenomeDict[chr] = ''.join(sequence)
            chr = line.strip().split('>')[1]
            sequence=[]
            Keep=False
            continue
        else:
            sequence.append(line.strip())
    GenomeDict[chr] = ''.join(sequence)

    if doCDSEsclude:
        listoflines = open(gtf)
        CDSDict={}
        ExcludeDict = {} # 1-based !!!!
        for line in listoflines:
            if line.startswith('#'):
                continue
            fields = line.split('\t')
            chr = fields[0]
            left = int(fields[3])
            right = int(fields[4])
            if fields[2] == 'exon':
                if ExcludeDict.has_key(chr):
                    pass
                else:
                    ExcludeDict[chr] = {}
                for i in range(left-2,right+3):
                    ExcludeDict[chr][i] = 1
                continue
            if fields[2] != 'CDS':
                continue
            transcriptID = fields[8].split('transcript_id "')[1].split('";')[0]
            strand = fields[6]
            if CDSDict.has_key(transcriptID):
                pass
            else:
                CDSDict[transcriptID] = []
            CDSDict[transcriptID].append((chr,left,right,strand))
        for transcriptID in CDSDict.keys():
#            sequence=''
            chr = CDSDict[transcriptID][0][0]
            strand  = CDSDict[transcriptID][0][-1]
            CDSDict[transcriptID].sort()
            Positions = []
            if strand == '+':
                for (chr,left,right,strand) in CDSDict[transcriptID]:
#                    sequence = sequence + GenomeDict[chr][left-1:right-1]
                    for i in range(left,right + 1):
                        Positions.append(i)
            if strand == '-':
                for (chr,left,right,strand) in CDSDict[transcriptID]:
#                    sequence = sequence + getReverseComplement(GenomeDict[chr][left-1:right-1])
                    for i in range(left,right + 1):
                        Positions.append(i)
            Positions.sort()
            if strand == '-':
                Positions.reverse()
#            sequence = sequence.upper()
#            sequence = sequence.replace('T','U')
#            print '.'
#            print transcriptID, len(sequence), len(Positions), CDSDict[transcriptID][0][0], CDSDict[transcriptID][0][1], CDSDict[transcriptID][-1][2]
            for i in range(0,len(Positions)-3,3):            
                codon = GenomeDict[chr][Positions[i]] + GenomeDict[chr][Positions[i+1]] + GenomeDict[chr][Positions[i+2]]
                codon.upper()
                codon.replace('T','U')
                if CodonDict4FoldDegenerate.has_key(codon):
#                    print chr, Positions[i+2],
                    if ExcludeDict[chr].has_key(Positions[i+2]):
                        del ExcludeDict[chr][Positions[i+2]]

    outfile = open(outfilename, 'w')
#    outfile.write('#chr\twindow\tGCskew\tTAskew\tGCskew_cum\tTAskew_cum\tTotalSkew\tTotalSkewcum\n')
    if doCDSEsclude:
        outfile.write('#chr\twindow\tGC%\tGCskew\tTAskew\tGCskew_cum\tTAskew_cum\tTotal_bp\n')
    else:
        outfile.write('#chr\twindow\tGC%\tGCskew\tTAskew\tGCskew_cum\tTAskew_cum\n')

    chromosomes = GenomeDict.keys()
    chromosomes.sort()

    for chr in chromosomes:
        GenomeDict[chr] = GenomeDict[chr].upper()
        length = len(GenomeDict[chr])
        GCcum = 0
        TAcum = 0
        if length <	 window:
            continue
        if doCDSEsclude:
            print chr, 'excluding:', len(ExcludeDict[chr].keys()), 'out of', len(GenomeDict[chr]), 'bases'
        for i in range(0,length,window/2):
            if (length - i) < window/2.:
                continue
            if doCDSEsclude:
                G = 0
                C = 0
                T = 0
                A = 0
                for j in range(i,min(i+window,length)):
                    if ExcludeDict.has_key(chr):
                        if ExcludeDict[chr].has_key(j+1):
                            continue
                    if GenomeDict[chr][j] == 'A':
                        A += 1
                    if GenomeDict[chr][j] == 'C':
                        C += 1
                    if GenomeDict[chr][j] == 'G':
                        G += 1
                    if GenomeDict[chr][j] == 'T':
                        T += 1
            else:
                G = GenomeDict[chr][i:min(i+window,length)].count('G')
                C = GenomeDict[chr][i:min(i+window,length)].count('C')
                A = GenomeDict[chr][i:min(i+window,length)].count('A')
                T = GenomeDict[chr][i:min(i+window,length)].count('T')
            if G + C == 0:
                GC = 'nan'
            else:
                GC = (G - C)/(G + C + 0.0)
                if doCDSEsclude:
                    GCcum += GC*((A + G + C + T + 0.0)/window)
                else:
                    GCcum += GC
            if A + T == 0:
                TA = 'nan'
            else:
                TA = (T - A)/(A + T + 0.0)
                if doCDSEsclude:
                    TAcum += TA*((A + G + C + T + 0.0)/window)
                else:
                    TAcum += TA
            if G + C + T + A == 0:
                GCperc = 'nan'
            else:
                GCperc = (G + C)/(G + C + T + A + 0.0)
#            outline = chr + '\t' + str(i+window/2)+ '\t' + str(GC) + '\t' + str(TA) + '\t' + str(GCcum) + '\t' + str(TAcum) + '\t' + str(GC + TA) + '\t' + str(GCcum + TAcum)
            outline = chr + '\t' + str(i+window/2) + '\t' + str(GCperc) + '\t' + str(GC) + '\t' + str(TA) + '\t' + str(GCcum) + '\t' + str(TAcum)
            if doCDSEsclude:
                outline = outline + '\t' + str(G + C + T + A)
            outfile.write(outline + '\n')

    outfile.close()
   
run()
