##################################
#                                #
# Last modified 03/11/2016       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import random

def getReverseComplement(preliminarysequence):
    
    DNA = {'A':'T',
           'T':'A',
           'a':'t',
           't':'a',
           'G':'C',
           'C':'G',
           'g':'c',
           'c':'g',
           'N':'N',
           'n':'n',
           'W':'S',
           'S':'W',
           'w':'s',
           's':'w',
           'M':'K',
           'K':'M',
           'm':'k',
           'k':'m',
           'R':'Y',
           'Y':'R',
           'r':'y',
           'y':'r',
           'B':'B',
           'D':'D',
           'H':'H',
           'V':'V',
           'b':'b',
           'd':'d',
           'h':'h',
           'v':'v',}

    sequence=''
    for j in range(len(preliminarysequence)):
        sequence=sequence+DNA[preliminarysequence[len(preliminarysequence)-j-1]]
    return sequence

def run():

    if len(sys.argv) < 4:
        print 'usage: python %s genome_fasta readlength outifle [-sample N] [-bothStrands] [-chr chrN1(,chrN2....)]' % sys.argv[0]
        print '\tuse - instead of a filename to indicated printing to stdout'
        sys.exit(1)

    genome = sys.argv[1]
    readlength = int(sys.argv[2])
    outputfilename = sys.argv[3]

    doChrSubset=False
    if '-chr' in sys.argv:
        doChrSubset=True
        WantedChrDict={}
        for chr in sys.argv[sys.argv.index('-chr')+1].split(','):
            WantedChrDict[chr]=''

    doSample = False
    if '-sample' in sys.argv:
        doSample = True
        N = int(sys.argv[sys.argv.index('-sample') + 1])

    doStdOut = False
    if outputfilename == '-':
        doStdOut = True

    doBS = False
    if '-bothStrands' in sys.argv:
        doBS = True

    SeqDict={}
    lineslist = open(genome)
    currentChr=''
    i=0
    for line in lineslist:
        i+=1
        if i % 1000000 == 0:
            if doStdOut:
                pass
            else:
                print i, 'lines processed'
        if line.startswith('>'):
            if currentChr != '':
                SeqDict[currentChr]=sequence
            currentChr=line.strip().split('>')[1]
            sequence=''
        else:
            sequence+=line.strip()

    SeqDict[currentChr]=sequence

    TS = 0.
    for chr in SeqDict.keys():
        TS += len(SeqDict[chr])
    if doBS:
        TS = 2*TS

    if doSample:
        p = N/TS
    else:
        p = 1

    keys=SeqDict.keys()
    keys.sort()

    if doStdOut:
        pass
    else:
        outfile = open(outputfilename, 'w')

    for chr in keys:
        if doChrSubset:
            if WantedChrDict.has_key(chr):
                pass
            else:
                continue
        if doStdOut:
            pass
        else:
            print chr
        for i in range(len(SeqDict[chr])-readlength+1):
            read=SeqDict[chr][i:i+readlength]
            if doSample:
                sp = random.random()
                if sp > p:
                    continue
            if doStdOut:
                print '>' + chr + ':' + str(i) + '-' + str(i+readlength)
                print read
            else:
                outfile.write('>'+chr+':'+str(i)+'-'+str(i+readlength)+'\n')
                outfile.write(read+'\n')
        if doBS:
            revseq = getReverseComplement(SeqDict[chr])
            for i in range(len(revseq)-readlength+1):
                read=revseq[i:i+readlength]
                if doSample:
                    sp = random.random()
                    if sp > p:
                        continue
                if doStdOut:
                    print '>' + chr + '-rev:' + str(i) + '-' + str(i+readlength)
                    print read
                else:
                    outfile.write('>' + chr + '-rev:' + str(i) + '-' + str(i+readlength) + '\n')
                    outfile.write(read+'\n')

    if doStdOut:
        pass
    else:
        outfile.close()

run()

