##################################
#                                #
# Last modified 2025/04/15       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
import math
from sets import Set
import os

def getReverseComplement(preliminarysequence):
    
    DNA = {'A':'T','T':'A','G':'C','C':'G','N':'N','a':'t','t':'a','g':'c','c':'g','n':'n'}
    sequence=''
    for j in range(len(preliminarysequence)):
        sequence=sequence+DNA[preliminarysequence[len(preliminarysequence)-j-1]]
    return sequence

def run():

    if len(sys.argv) < 4:
        print 'usage: python %s bismark.cov context strand genome.fasta [-0-based]' % sys.argv[0]
        print "\tNote: it is assumed the sequence context begins or ends at the positions in the coverage file"
        print "\tthe script accepts .gz and .bz2 files"
        print "\tthe script will print to stdout"
        sys.exit(1)

    doOB = False
    if '-0-based' in sys.argv:
        doOB = True

    input = sys.argv[1]
    WantedCG = sys.argv[2]
    revWantedCG = getReverseComplement(WantedCG)
    strand = sys.argv[3]
    fasta = sys.argv[4]
    GenomeDict={}
    sequence=''
    inputdatafile = open(fasta)
    for line in inputdatafile:
        if line[0]=='>':
            if sequence != '':
                GenomeDict[chr] = ''.join(sequence).upper()
            chr = line.strip().split('>')[1]
            sequence=[]
            Keep=False
            continue
        else:
            sequence.append(line.strip())
    GenomeDict[chr] = ''.join(sequence).upper()

    p=0
    i=0
    if input.endswith('.bz2'):
        cmd = 'bzip2 -cd ' + input
    elif input.endswith('.gz') or input.endswith('.bgz'):
        cmd = 'gunzip -c ' + input
    else:
        cmd = 'cat ' + input
    p1 = os.popen(cmd, "r")
    line = '.'
    while line != '':
        line = p1.readline()
        if line.startswith('#') or line.startswith('track ') or line.strip() == '':
            print line.strip()
            continue
        fields = line.strip().split('\t')
        chr = fields[0]
        if doOB:
            start = int(fields[1])
        else:
            start = int(fields[1]) - 1
        if strand == '+' and GenomeDict[chr][start:start+len(WantedCG)] == WantedCG:
            print line.strip()
        if strand == '-' and GenomeDict[chr][start:start+len(WantedCG)] == revWantedCG:
            print line.strip()

run()
