##################################
#                                #
# Last modified 2021/06/09       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
import math
from sets import Set
import os

def getReverseComplement(preliminarysequence):
    
    DNA = {'A':'T','T':'A','G':'C','C':'G','N':'N','a':'t','t':'a','g':'c','c':'g','n':'n'}
    sequence=''
    for j in range(len(preliminarysequence)):
        sequence=sequence+DNA[preliminarysequence[len(preliminarysequence)-j-1]]
    return sequence

def run():

    if len(sys.argv) < 3:
        print 'usage: python %s bismark.cov context genome.fasta [-0-based] [-exclude context bp_before bp_after]' % sys.argv[0]
        print "\tNote: it is assumed the sequence context begins or ends at the positions in the coverage file"
        print "\tthe script accepts .gz and .bz2 files"
        print "\tthe script will print to stdout"
        sys.exit(1)

    doExclude = False
    if '-exclude' in sys.argv:
        doExclude = True
        ExCont = sys.argv[sys.argv.index('-exclude') + 1]
        revExCont = getReverseComplement(ExCont)
        ECBefore = int(sys.argv[sys.argv.index('-exclude') + 2])
        ECAfter = int(sys.argv[sys.argv.index('-exclude') + 3])
#        print 'will exclude the', ExCont, 'context', ECBefore, ECAfter

    doOB = False
    if '-0-based' in sys.argv:
        doOB = True

    input = sys.argv[1]
    WantedCG = sys.argv[2]
    revWantedCG = getReverseComplement(sys.argv[2])
    fasta = sys.argv[3]
    GenomeDict={}
    sequence=''
    inputdatafile = open(fasta)
    for line in inputdatafile:
        if line[0]=='>':
            if sequence != '':
                GenomeDict[chr] = ''.join(sequence).upper()
            chr = line.strip().split('>')[1]
            sequence=[]
            Keep=False
            continue
        else:
            sequence.append(line.strip())
    GenomeDict[chr] = ''.join(sequence).upper()

    p=0
    i=0
    if input.endswith('.bz2'):
        cmd = 'bzip2 -cd ' + input
    elif input.endswith('.gz'):
        cmd = 'gunzip -c ' + input
    else:
        cmd = 'cat ' + input
    p1 = os.popen(cmd, "r")
    line = '.'
    while line != '':
        line = p1.readline()
        if line.startswith('#') or line.startswith('track ') or line.strip() == '':
            print line.strip()
            continue
        fields = line.strip().split('\t')
        chr = fields[0]
        if doOB:
            start = int(fields[1])
        else:
            start = int(fields[1]) - 1
#        print chr, fields[1], start, start+len(WantedCG), GenomeDict[chr][start-2:start+2], GenomeDict[chr][start:start+len(WantedCG)], GenomeDict[chr][start-len(WantedCG)+1:start+1]
        if GenomeDict[chr][start:start+len(WantedCG)] == WantedCG or GenomeDict[chr][start-len(WantedCG)+1:start+1] == WantedCG or GenomeDict[chr][start:start+len(WantedCG)] == revWantedCG or GenomeDict[chr][start-len(WantedCG)+1:start+1] == revWantedCG:
            if doExclude:
                if GenomeDict[chr][start - ECBefore:start + len(WantedCG) + ECAfter] == ExCont or GenomeDict[chr][start-len(WantedCG)+1 - ECBefore:start+1+ECAfter] == ExCont or GenomeDict[chr][start - ECAfter:start + len(WantedCG) + ECBefore] == revExCont or GenomeDict[chr][start-len(WantedCG)+1 - ECAfter:start + 1 + ECBefore] == revExCont:
                    continue
            print line.strip()

run()
