##################################
#                                #
# Last modified 2025/04/09       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string

def getReverseComplement(preliminarysequence):
    
    DNA = {'A':'T','T':'A','G':'C','C':'G','N':'N','a':'t','t':'a','g':'c','c':'g','n':'n', 'r':'y', 'R':'Y', 'y':'r', 'Y':'R'}
    sequence=''
    for j in range(len(preliminarysequence)):
        sequence=sequence+DNA[preliminarysequence[len(preliminarysequence)-j-1]]
    return sequence


def run():

    if len(sys.argv) < 4:
#        print 'usage: python %s fasta seqlen maxG/C outprefix [-bed file chrFieldID leftFieldID]' % sys.argv[0]
        sys.exit(1)

    fasta = sys.argv[1]
    seqLen = int(sys.argv[2])
    maxC = int(sys.argv[3])
    outprefix = sys.argv[4]

    doBed = False
    if '-bed' in sys.argv:
        doBed = True
        bedFile = sys.argv[sys.argv.index('-bed') + 1]
        chrFieldID = int(sys.argv[sys.argv.index('-bed') + 2])
        leftFieldID = int(sys.argv[sys.argv.index('-bed') + 3])
        BedDict = {}
        lineslist = open(bedFile)
        for line in lineslist:
            if line.startswith('#'):
                continue
            fields = line.strip().split('\t')
            chr = fields[chrFieldID]
            if chr in BedDict:
                pass
            else:
                BedDict[chr] = {}
            left = int(fields[leftFieldID])
            right = int(fields[leftFieldID + 1])
            for i in range(left,right):
                BedDict[chr][i] = 1

    GenomeDict={}
    sequence=''
    inputdatafile = open(fasta)
    for line in inputdatafile:
        if line[0]=='>':
            if sequence != '':
                if doBed:
                    if chr in BedDict:
                        GenomeDict[chr] = ''.join(sequence).upper()
                else:
                    GenomeDict[chr] = ''.join(sequence).upper()
            chr = line.strip().split('>')[1]
            sequence=[]
            Keep=False
            continue
        else:
            sequence.append(line.strip())
    if doBed:
        if chr in BedDict:
            GenomeDict[chr] = ''.join(sequence).upper()
    else:
        GenomeDict[chr] = ''.join(sequence).upper()

    outfileCPlus = open(outprefix + '.noC-plus.bed', 'w')
    outfileCMinus = open(outprefix + '.noC-minus.bed', 'w')
    outfileGPlus = open(outprefix + '.noG-plus.bed', 'w')
    outfileGMinus = open(outprefix + '.noG-minus.bed', 'w')

    chrs = list(GenomeDict.keys())
    chrs.sort()

    for chr in chrs:
        if doBed:
            if chr in BedDict:
                pass
            else:
                continue
#        print chr
        for i in range(len(GenomeDict[chr])-seqLen):
            if doBed:
                if i in BedDict[chr]:
                    pass
                else:
                    continue
            sequence = GenomeDict[chr][i:i+seqLen]
            if sequence.count('N') > maxC:
                continue
            if sequence.count('C') <= maxC:
                outline = chr + '\t' + str(i) + '\t' + str(i+seqLen) + '\t+\t' + sequence + '\t' + str(sequence.count('C'))
                outfileCPlus.write(outline + '\n')
            revComp = getReverseComplement(sequence)
            if revComp.count('C') <= maxC:
                outline = chr + '\t' + str(i) + '\t' + str(i+seqLen) + '\t-\t' + revComp + '\t' + str(revComp.count('C'))
                outfileCMinus.write(outline + '\n')
            if sequence.count('G') <= maxC:
                outline = chr + '\t' + str(i) + '\t' + str(i+seqLen) + '\t+\t' + sequence + '\t' + str(sequence.count('G'))
                outfileGPlus.write(outline + '\n')
            revComp = getReverseComplement(sequence)
            if revComp.count('G') <= maxC:
                outline = chr + '\t' + str(i) + '\t' + str(i+seqLen) + '\t-\t' + revComp + '\t' + str(revComp.count('G'))
                outfileGMinus.write(outline + '\n')

    outfileCPlus.close()
    outfileCMinus.close()
    outfileGPlus.close()
    outfileGMinus.close()
   
run()
