##################################
#                                #
# Last modified 2025/04/16       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string

def getReverseComplement(preliminarysequence):
    
    DNA = {'A':'T','T':'A','G':'C','C':'G','N':'N','a':'t','t':'a','g':'c','c':'g','n':'n', 'r':'y', 'R':'Y', 'y':'r', 'Y':'R'}
    sequence=''
    for j in range(len(preliminarysequence)):
        sequence=sequence+DNA[preliminarysequence[len(preliminarysequence)-j-1]]
    return sequence


def run():

    if len(sys.argv) < 4:
        print 'usage: python %s fasta seqlen maxG/C outprefix [-bed file chrFieldID leftFieldID] [-GCrange min max] [-lastG]' % sys.argv[0]
        sys.exit(1)

    fasta = sys.argv[1]
    seqLen = int(sys.argv[2])
    maxC = int(sys.argv[3])
    outprefix = sys.argv[4]

    doGCRange = False
    if '-GCrange' in sys.argv:
        doGCRange = True
        minGC = float(sys.argv[sys.argv.index('-GCrange') + 1])
        maxGC = float(sys.argv[sys.argv.index('-GCrange') + 2])
        print 'will require GC content to be within', minGC, 'and', maxGC

    doLastG = False
    if '-lastG' in sys.argv:
        doLastG = True
        print 'will require sequence to end with a G'

    doBed = False
    if '-bed' in sys.argv:
        doBed = True
        bedFile = sys.argv[sys.argv.index('-bed') + 1]
        chrFieldID = int(sys.argv[sys.argv.index('-bed') + 2])
        leftFieldID = int(sys.argv[sys.argv.index('-bed') + 3])
        BedDict = {}
        lineslist = open(bedFile)
        for line in lineslist:
            if line.startswith('#'):
                continue
            fields = line.strip().split('\t')
            chr = fields[chrFieldID]
            if BedDict.has_key(chr):
                pass
            else:
                BedDict[chr] = {}
            left = int(fields[leftFieldID])
            right = int(fields[leftFieldID + 1])
            for i in range(left,right):
                BedDict[chr][i] = 1

    GenomeDict={}
    sequence=''
    inputdatafile = open(fasta)
    for line in inputdatafile:
        if line[0]=='>':
            if sequence != '':
                if doBed:
                    if BedDict.has_key(chr):
                        GenomeDict[chr] = ''.join(sequence).upper()
                else:
                    GenomeDict[chr] = ''.join(sequence).upper()
            chr = line.strip().split('>')[1]
            sequence=[]
            Keep=False
            continue
        else:
            sequence.append(line.strip())
    if doBed:
        if BedDict.has_key(chr):
            GenomeDict[chr] = ''.join(sequence).upper()
    else:
        GenomeDict[chr] = ''.join(sequence).upper()

    outfileCPlus = open(outprefix + '.plus.bed', 'w')
    outfileCMinus = open(outprefix + '.minus.bed', 'w')
#    outfileGPlus = open(outprefix + '.noG-plus.bed', 'w')
#    outfileGMinus = open(outprefix + '.noG-minus.bed', 'w')

    chrs = GenomeDict.keys()
    chrs.sort()

    for chr in chrs:
        if doBed:
            if BedDict.has_key(chr):
                pass
            else:
                continue
        print chr
        for i in range(len(GenomeDict[chr])-seqLen):
            if doBed:
                if BedDict[chr].has_key(i):
                    pass
                else:
                    continue
            sequence = GenomeDict[chr][i:i+seqLen]
            if sequence.count('N') > maxC:
                continue
            if doGCRange:
                GCcontent = (sequence.count('C') + sequence.count('G') + 0.0)/len(sequence)
                if GCcontent >= minGC and GCcontent <= maxGC:
                    pass
                else:
                    continue
            if sequence.count('G') <= maxC:
                if doLastG:
                    if sequence.endswith('C'):
                        pass
                    else:
                        continue
                outline = chr + '\t' + str(i) + '\t' + str(i+seqLen) + '\t+\t' + sequence + '\t' + str(sequence.count('C'))
                outfileCPlus.write(outline + '\n')
            revComp = getReverseComplement(sequence)
            if revComp.count('G') <= maxC:
                if doLastG:
                    if revComp.endswith('C'):
                        pass
                    else:
                        continue
                outline = chr + '\t' + str(i) + '\t' + str(i+seqLen) + '\t-\t' + revComp + '\t' + str(revComp.count('C'))
                outfileCMinus.write(outline + '\n')
#            if sequence.count('G') <= maxC:
#                outline = chr + '\t' + str(i) + '\t' + str(i+seqLen) + '\t+\t' + sequence + '\t' + str(sequence.count('G'))
#                outfileGPlus.write(outline + '\n')
#            revComp = getReverseComplement(sequence)
#            if revComp.count('G') <= maxC:
#                outline = chr + '\t' + str(i) + '\t' + str(i+seqLen) + '\t-\t' + revComp + '\t' + str(revComp.count('G'))
#                outfileGMinus.write(outline + '\n')

    outfileCPlus.close()
    outfileCMinus.close()
#    outfileGPlus.close()
#    outfileGMinus.close()
   
run()
