##################################
#                                #
# Last modified 06/24/2012       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
import random

def run():

    if len(sys.argv) < 5:
        print 'usage: python %s fasta readLength Coverage fraction_tiles outputfilename' % sys.argv[0]
        print '\tif a contig is shorter than the read length, its full length will be outputted'
        print '\tby default the minimum overlap between reads will be half the read length;'
        print '\tfraction of tiles corresponds to the fraction of reads that will be tiles with a step equal or greater than the minimum overlap; the rest will be randomly chosen subsequences'
        sys.exit(1)

    fasta = sys.argv[1]
    readLength = int(sys.argv[2])
    minOverlap = int(readLength/2)
    coverage = int(sys.argv[3])
    tiles_fraction = float(sys.argv[4])
    outfilename = sys.argv[5]

    outfile = open(outfilename, 'w')

    inputdatafile = open(fasta)
    ID=''
    r=0
    for line in inputdatafile:
        if line[0]=='>':
            if ID == '':
                ID = line.strip().split('>')[1]
            else:
                sequence = ''.join(sequence)
                if len(sequence) <= readLength:
                    for i in range(coverage):
                        r+=1
                        outfile.write('>read' + str(r) +  '\n')
                        outfile.write(sequence + '\n')
                else:
                    for i in range(int(coverage/(1.0/tiles_fraction)+1)):
                        for j in range(0,len(sequence)-minOverlap,minOverlap):
                            read = sequence[min(j,len(sequence)-readLength):min(j,len(sequence)-readLength)+readLength]
                            r+=1
                            outfile.write('>read' + str(r) +  '\n')
                            outfile.write(read + '\n')
                    for i in range(int((len(sequence)/(readLength+0.0))*(coverage - coverage/(1.0/tiles_fraction))+1)):
                        j = random.randint(0,len(sequence)-readLength)
                        read = sequence[j:j+readLength]
                        r+=1
                        outfile.write('>read' + str(r) +  '\n')
                        outfile.write(read + '\n')
                ID = line.strip().split('>')[1]
            sequence=[]
        else:
            sequence.append(line.strip())   

    sequence = ''.join(sequence)
    if len(sequence) <= readLength:
        for i in range(coverage):
            r+=1
            outfile.write('>read' + str(r) +  '\n')
            outfile.write(sequence + '\n')
    else:
        for i in range(int(coverage/(1.0/tiles_fraction)+1)):
            for j in range(0,len(sequence)-minOverlap,minOverlap):
                read = sequence[min(j,len(sequence)-readLength):min(j,len(sequence)-readLength)+readLength]
                r+=1
                outfile.write('>read' + str(r) +  '\n')
                outfile.write(read + '\n')
        for i in range(int((len(sequence)/(readLength+0.0))*(coverage - coverage/(1.0/tiles_fraction))+1)):
            j = random.randint(0,len(sequence)-readLength)
            read = sequence[j:j+readLength]
            r+=1
            outfile.write('>read' + str(r) +  '\n')
            outfile.write(read + '\n')

    outfile.close()
   
run()
