##################################
#                                #
# Last modified 2017/03/13       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string

def run():

    if len(sys.argv) < 6:
        print 'usage: python %s fasta regionsFile chrFieldID leftFieldID rightFieldID outputfilename' % sys.argv[0]
        print '\tNote: 1-based coordinates are assumed for the regions file'
        sys.exit(1)

    fasta = sys.argv[1]
    regions = sys.argv[2]
    chrFieldID = int(sys.argv[3])
    leftFieldID = int(sys.argv[4])
    rightFieldID = int(sys.argv[5])
    outfilename = sys.argv[6]

    blocksize = 100

    CovDict = {}
    inputdatafile = open(regions)
    for line in inputdatafile:
        if line.startswith('#'):
            continue
        fields = line.strip().split('\t')
        chr = fields[chrFieldID]
        left = int(fields[leftFieldID])
        right = int(fields[rightFieldID])
        if CovDict.has_key(chr):
            pass
        else:
            CovDict[chr] = {}
        for i in range(left,right):
            CovDict[chr][i-1] = 0

    GenomeDict={}
    sequence=''
    inputdatafile = open(fasta)
    for line in inputdatafile:
        if line[0]=='>':
            if sequence != '':
                GenomeDict[chr] = ''.join(sequence)
            chr = line.strip().split('>')[1]
            print chr
            sequence=[]
            Keep=False
            continue
        else:
            sequence.append(line.strip())
    GenomeDict[chr] = ''.join(sequence)

    outfile = open(outfilename, 'w')

    for chr in GenomeDict.keys():
        outline = '>' + chr
        outfile.write(outline + '\n')
        if CovDict.has_key(chr):
            sequence = ''
            for i in range(0,len(GenomeDict[chr])):
                if CovDict[chr].has_key(i):
                    sequence += 'N'
                else:
                    sequence += GenomeDict[chr][i]
        else:
            sequence = GenomeDict[chr]
        for i in range(0,len(sequence),blocksize):
            outfile.write(sequence[i:min(i+blocksize, len(sequence))] + '\n')
        
    outfile.close()
   
run()
