##################################
#                                #
# Last modified 11/07/2015       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string

def _read_fasta(path):
    """Load a FASTA file into a dict of record name -> upper-cased sequence.

    The record name is the text between the first '>' of a header line and
    the next '>' (or the end of the line), with surrounding whitespace
    stripped.  If the same name occurs more than once, the last record wins.

    Raises:
        ValueError: if non-blank text appears before the first header line.
    """
    genome = {}
    name = None
    pieces = []
    with open(path) as handle:
        for line in handle:
            if line.startswith('>'):
                # A new header closes the record collected so far.
                if name is not None:
                    genome[name] = ''.join(pieces).upper()
                name = line.strip().split('>')[1]
                pieces = []
                continue
            text = line.strip()
            if name is None:
                # No header seen yet: tolerate blank lines, reject data.
                if text:
                    raise ValueError(
                        'sequence data found before the first FASTA header in %s'
                        % path)
                continue
            pieces.append(text)
    # The final record has no following header to flush it.
    if name is not None:
        genome[name] = ''.join(pieces).upper()
    return genome


def _write_record(outfile, name, sequence, width):
    """Write one FASTA record to outfile, wrapping the sequence at width."""
    outfile.write('>' + name + '\n')
    for start in range(0, len(sequence), width):
        outfile.write(sequence[start:start + width] + '\n')


def run():
    """Split a FASTA file into several files of roughly chunk_size bp each.

    Command-line arguments (read from sys.argv):
        fasta       path of the input FASTA file
        chunk_size  minimum number of bp per output file (integer)
        outprefix   output files are named <outprefix>1.fa, <outprefix>2.fa, ...

    Records are written longest first (ties broken by name, descending).
    Whole records are appended to the current output file until its total
    length reaches chunk_size; a record is never split across files.

    Prints the usage text and exits with status 1 when fewer than three
    arguments are given.
    """
    # Three positional arguments are required, i.e. argv must hold 4 items.
    if len(sys.argv) < 4:
        print('usage: python %s fasta chunk_size(bp) outprefix' % sys.argv[0])
        print('Note: the script will not split chromosomes larger than the chunk size into smaller pieces, it will output at least one whole chromsome per file')
        sys.exit(1)

    fasta = sys.argv[1]
    chunk = int(sys.argv[2])
    outprefix = sys.argv[3]

    genome = _read_fasta(fasta)

    print('finished inputting sequence')
    # Formatted so the output matches the original two-item print statement.
    print('%s %s' % ('chunk size = ', chunk))

    # Longest records first; names are unique dict keys, so order is total.
    by_length = sorted(
        ((len(seq), name) for name, seq in genome.items()), reverse=True)

    blocksize = 100
    file_index = 0
    current_size = 0
    outfile = None
    try:
        for length, name in by_length:
            # Open lazily so no empty trailing file is left behind when the
            # last record happens to fill up a chunk.
            if outfile is None:
                outfile = open(outprefix + str(file_index + 1) + '.fa', 'w')
            _write_record(outfile, name, genome[name], blocksize)
            current_size += length
            if current_size >= chunk:
                outfile.close()
                outfile = None
                print('%s %s' % (file_index, current_size))
                file_index += 1
                current_size = 0
    finally:
        if outfile is not None:
            outfile.close()
   
# Only do the split when executed as a script, so that importing this
# module does not read sys.argv or touch any files.
if __name__ == '__main__':
    run()
