##################################
#                                #
# Last modified 11/14/2013       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string

def getReverseComplement(preliminarysequence):
    
    DNA = {'A':'T','T':'A','G':'C','C':'G','N':'N','X':'X','a':'t','t':'a','g':'c','c':'g','n':'n','x':'x','R':'R','r':'r','M':'M','m':'m','Y':'Y','y':'y','S':'S','s':'s'}
    sequence=''
    for j in range(len(preliminarysequence)):
        sequence=sequence+DNA[preliminarysequence[len(preliminarysequence)-j-1]]
    return sequence

def run():

    if len(sys.argv) < 8:
        print 'usage: python %s fasta inputfilename radius chrFieldID leftFieldIF rightFieldID strandFieldID outprefix [-BT biotype biotypeFieldID] [-skipMissingChr]' % sys.argv[0]
        sys.exit(1)

    fasta = sys.argv[1]
    inputfilename = sys.argv[2]
    radius = int(sys.argv[3])
    chrFieldID = int(sys.argv[4])
    leftFieldID = int(sys.argv[5])
    rightFieldID = int(sys.argv[6])
    strandFieldID = int(sys.argv[7])
    outprefix = sys.argv[8]

    doBT = False
    if '-BT' in sys.argv:
        doBT = True
        BT = sys.argv[sys.argv[4].index('-BT')+1]
        BTFieldID = int(sys.argv[sys.argv[4].index('-BT')+2])

    doSkipMissingChr = False
    if '-skipMissingChr' in sys.argv:
        doSkipMissingChr = True

    inputdatafile = open(fasta)
    chr = ''
    SequenceDict = {}
    for line in inputdatafile:
        if line[0]=='>':
            if chr != '':
                sequence = ''.join(sequence)
                SequenceDict[chr] = sequence
            chr = line.strip().split('>')[1]
            print chr
            sequence=[]
        else:
            sequence.append(line.strip())
    sequence = ''.join(sequence)
    SequenceDict[chr] = sequence

    outfile3 = open(outprefix + '.acceptor.fa', 'w')
    outfile5 = open(outprefix + '.donor.fa', 'w')
    
    inputdatafile = open(inputfilename)
    for line in inputdatafile:
        if line[0]=='#':
            continue
        fields = line.strip().split('\t')
        chr = fields[chrFieldID]
        if doSkipMissingChr:
            if SequenceDict.has_key(chr):
                pass
            else:
                continue
        left = int(fields[leftFieldID])
        right = int(fields[rightFieldID])
        strand = fields[strandFieldID]
        if doBT:
            if fields[BTFieldID] != BT:
                continue
        if strand == '+':
            sequence5 = SequenceDict[chr][left-radius:left+radius]
            sequence3 = SequenceDict[chr][right-1-radius:right-1+radius]
        if strand == '-':
            sequence3 = getReverseComplement(SequenceDict[chr][left-radius:left+radius])
            sequence5 = getReverseComplement(SequenceDict[chr][right-1-radius:right-1+radius])
#        print fields
#        print sequence5, sequence3
        outline = '>'
        for f in fields:
            outline = outline + f + '::'
        outfile3.write(outline[0:-2] + '\n')
        outfile5.write(outline[0:-2] + '\n')
        outfile3.write(sequence3 + '\n')
        outfile5.write(sequence5 + '\n')
        
    outfile3.close()        
    outfile5.close()        

run()
