##################################
#                                #
# Last modified 2019/03/18       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
import math
import os
from sets import Set

def getReverseComplement(preliminarysequence):
    
    DNA = {'A':'T',
           'T':'A',
           'G':'C',
           'C':'G',
           'N':'N',
           'X':'N',
           'a':'t',
           't':'a',
           'g':'c',
           'c':'g',
           'n':'N',
           'x':'N',
           'R':'N',
           'r':'N',
           'M':'N',
           'm':'N',
           'Y':'N',
           'y':'N',
           'S':'N',
           's':'N',
           'K':'N',
           'k':'N',
           'W':'N',
           'w':'N',
           'H':'N',
           'h':'N',
           'V':'N',
           'v':'N',
           'B':'N',
           'b':'N',
           'D':'N',
           'd':'N'
            }
    sequence=''
    for j in range(len(preliminarysequence)):
        sequence=sequence+DNA[preliminarysequence[len(preliminarysequence)-j-1]]
    return sequence

def run():

    if len(sys.argv) < 4:
        print 'usage: python %s fasta guides radius outputfilename [-offset bp] [-highlightMiddle]' % sys.argv[0]
        print '\tguides input format:' 
        print '\tchr19:40398966-40398986__chr19:40398974-40398996:-,48,0.17726806,15;2:1|3:14,GCGGGGATAAGGTCATGGGG' 
        sys.exit(1)
    
    fasta = sys.argv[1]
    guides = sys.argv[2]
    radius = int(sys.argv[3])
    outfilename = sys.argv[4]

    OS = 0
    doOffset = False
    if '-offset' in sys.argv:
        doOffset = True
        OS = int(sys.argv[sys.argv.index('-offset') + 1])
        print 'will shift cut sites regions by', OS, 'bp'

    doHM = False
    if '-highlightMiddle' in sys.argv:
        print 'will highlight the middle position'
        doHM = True

    GenomeDict={}
    sequence=''
    inputdatafile = open(fasta)
    for line in inputdatafile:
        if line[0]=='>':
            if sequence != '':
                GenomeDict[chr] = ''.join(sequence)
            chr = line.strip().split('>')[1]
            print chr
            sequence=[]
            Keep=False
            continue
        else:
            sequence.append(line.strip())
    GenomeDict[chr] = ''.join(sequence)

    GuideDict = {}

    if guides.endswith('.bz2'):
        cmd = 'bzip2 -cd ' + guides
    elif guides.endswith('.gz'):
        cmd = 'zcat ' + guides
    else:
        cmd = 'cat ' + guides
    p = os.popen(cmd, "r")
    LC = 0
    line = 'line'
    while line != '':
        line = p.readline()
        if line == '':
            break
        if line.startswith('#'):
            continue
        fields = line.strip().split(',')
        chr = fields[0].split(':')[0]
        regionleft = int(fields[0].split(':')[1].split('_')[0].split('-')[0])
        regionright = int(fields[0].split(':')[1].split('_')[0].split('-')[1])
        left = int(fields[0].split(':')[-2].split('-')[0])
        right = int(fields[0].split(':')[-2].split('-')[1])
        strand = fields[0][-1]
        if strand == '+':
            cut = right - 3 - 3 + OS
        if strand == '-':
            cut = left + 3 + 3 - OS
        CFD = fields[2]
        region = (chr,regionleft,regionright)
        if GuideDict.has_key(region):
            pass
        else:
            GuideDict[region] = {}
            GuideDict[region]['+'] = []
            GuideDict[region]['-'] = []
        GuideDict[region][strand].append((CFD,cut,left,right))

    outfile = open(outfilename,'w')

    regions = GuideDict.keys()
    regions.sort()

    for region in regions:
         (chr,regionleft,regionright) = region
         outline = '###################################'
         outfile.write(outline+'\n')
         outline = '###################################'
         outfile.write(outline+'\n')
         outline = chr + ':' + str(regionleft) + '-' + str(regionright)
         outfile.write(outline+'\n')
         NL = regionleft - radius
         NR = regionright + radius
         sequence = GenomeDict[chr][NL:NR]
         MP = int((NR + NL)/2)
         if doHM:
             MPos = int((NR - NL)/2)
             sequence = sequence.lower()
             sequence = sequence[0:MPos-1] + sequence[MPos-1].upper() + sequence[MPos:]
         revcomp = getReverseComplement(sequence)
         GuideDict[region]['+'].sort()
         GuideDict[region]['-'].sort()
         for (CFD,cut,left,right) in GuideDict[region]['+']:
             outline = ''
             for i in range(left - NL - 1):
                 outline += '.'
             outline += '5p-'
             guide = GenomeDict[chr][left - 1:right - 4].upper()
             outline += guide
             outline += '-3p'
             for i in range(NR - right + 3 + 1):
                 outline += '.'
             outfile.write(outline + '\t' + str(CFD)[0:4] + '\t' + str(cut - MP) + '\n')
         outline = '5p-' + sequence + '-3p'
         outfile.write(outline + '\tCFD\tDistance_to_center' + '\n')
         outline = '3p-' + revcomp[::-1] + '-5p'
         outfile.write(outline + '\tCFD\tDistance_to_center' + '\n')
         for (CFD,cut,left,right) in GuideDict[region]['-']:
             outline = ''
             for i in range(left - NL + 2):
                 outline += '.'
             outline += '3p-'
             guide = getReverseComplement(GenomeDict[chr][left + 2:right - 1]).upper()
             outline += guide[::-1]
             outline += '-5p'
             for i in range(NR - right + 1):
                 outline += '.'
             outfile.write(outline + '\t' + str(CFD)[0:4] + '\t' + str(cut - MP) + '\n')

    outfile.close()
   
run()
