##################################
#                                #
# Last modified 2017/09/10       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string

def run():

    if len(sys.argv) < 2:
        print 'usage: python %s fasta outpurfilename' % sys.argv[0]
        sys.exit(1)

    fasta = sys.argv[1]
    outfilename = sys.argv[2]

    GenomeDict={}
    sequence=''
    inputdatafile = open(fasta)
    for line in inputdatafile:
        if line[0]=='>':
            if sequence != '':
                GenomeDict[chr] = ''.join(sequence)
            chr = line.strip().split('>')[1]
            sequence=[]
            Keep=False
            continue
        else:
            sequence.append(line.strip())
    GenomeDict[chr] = ''.join(sequence)

    chromosomes = GenomeDict.keys()
    chromosomes.sort()

    DNDict = {}
    DNtotal = 0.0
 
    for chr in chromosomes:
        if '|ref|YP' in chr or '|ref|NP' in chr:
            continue
        print chr
        GenomeDict[chr] = GenomeDict[chr].upper()
        length = len(GenomeDict[chr])
        for i in range(length - 1):
            DN = GenomeDict[chr][i:i+2]
            if DNDict.has_key(DN):
                pass
            else:
                DNDict[DN] = 0
            DNDict[DN] += 1
            DNtotal += 1

    DNs = DNDict.keys()
    DNs.sort()

    outfile = open(outfilename, 'w')
    outline = '#dinucleotide\tfrequency'  
    outfile.write(outline + '\n')

    for DN in DNs:
        outline = DN + '\t' + str(DNDict[DN]) + '\t' + str(DNDict[DN]/DNtotal)
        outfile.write(outline + '\n')

    outfile.close()
   
run()
