##################################
#                                #
# Last modified 2017/12/04       #
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
import math
from sets import Set
import time

def getReverseComplement(preliminarysequence):
    
    DNA = {'A':'T','T':'A','G':'C','C':'G','N':'N','X':'X','a':'t','t':'a','g':'c','c':'g','n':'n','x':'x','R':'R','r':'r','M':'M','m':'m','Y':'Y','y':'y','S':'S','s':'s','K':'K','k':'k','W':'W','w':'w'}
    sequence=''
    for j in range(len(preliminarysequence)):
        sequence=sequence+DNA[preliminarysequence[len(preliminarysequence)-j-1]]
    return sequence

def run():

    if len(sys.argv) < 3:
        print 'usage: python %s input sequence_fieldID outfile [-5pG] [-ATstretchFilter bp] [-noRestrictionSites kmer1(,kmer2,...,kmerN)] [-FromOldVersion] [-minGC fraction] [-maxGC fraction]' % sys.argv[0]
        sys.exit(1)

    input = sys.argv[1]
    fieldID = int(sys.argv[2])
    outfilename = sys.argv[3]

    REkmers = []
    if '-noRestrictionSites' in sys.argv:
        kmers = sys.argv[sys.argv.index('-noRestrictionSites') + 1].split(',')
        for kmer in kmers:
            REkmers.append(kmer.upper())
            REkmers.append(getReverseComplement(kmer).upper())
        print 'will discard all guides containing the following kmers', REkmers

    mAT = 100000000
    if '-ATstretchFilter' in sys.argv:
        mAT = int(sys.argv[sys.argv.index('-ATstretchFilter') + 1])
        print 'will discard all guides with stretches of A/T longer than', mAT, 'bases'
        maxA = ''
        maxT = ''
        for i in range(mAT):
            maxA = maxA + 'A'
        for i in range(mAT):
            maxT = maxA + 'T'

    minGC = 0.0
    if '-minGC' in sys.argv:
        minGC = float(sys.argv[sys.argv.index('-minGC') + 1])
        print 'will discard all guides with GC content lower than', minGC

    maxGC = 1.0
    if '-maxGC' in sys.argv:
        maxGC = float(sys.argv[sys.argv.index('-maxGC') + 1])
        print 'will discard all guides with GC content lower than', maxGC

    do5pG = False
    if '-5pG' in sys.argv:
        do5pG = True
        print 'will require a 5p G'

    doFOV = False
    if '-FromOldVersion' in sys.argv:
        doFOV = True

    outfile = open(outfilename, 'w')

    linelist = open(input)
    for line in linelist:
        if line.startswith('#'):
            outline = line.strip()
            if doFOV:
                outline = line.strip() + '\tgRNA_sequence_reverse_complement\tGC%'
            outfile.write(outline + '\n') 
            continue
        fields = line.strip().split('\t')
        sequence = fields[fieldID]
        if do5pG and sequence[0] != 'G':
            continue
        if maxA in sequence or maxT in sequence:
            continue
        HasRE = False
        for REST in REkmers:
            if REST in sequence:
                HasRE = True
            if HasRE:
               continue
        outline = line.strip()
        GC = (sequence.count('G') + sequence.count('C'))/(len(sequence) + 0.0)
        if GC < minGC:
            continue
        if GC > maxGC:
            continue
        if doFOV:
            outline = outline + '\t' + getReverseComplement(sequence)
            outline = outline + '\t' + str(GC)
        
        outfile.write(outline + '\n')

    print 'finished filtering guides'

    outfile.close()

run()
