##################################
#                                #
# Last modified 11/01/2015       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
import os
from sets import Set

def run():

    if len(sys.argv) < 3:
        print 'usage: python %s query.fa target.fa blast_output maxEValue outfile [-minPercIdent percent] [-minFractionQLen fraction] [-minFractionTLen fraction] [-minTFractionORminQFraction]' % sys.argv[0]
        print 'Note: the -minTFractionORminQFraction option will require either the query or the target alignment to exceed the indicated fractions in the -minFractionTLen and -minFractionQLen options, but will not require both to do so'
        sys.exit(1)

    query = sys.argv[1]
    target = sys.argv[2]
    blast = sys.argv[3]
    maxE = float(sys.argv[4])
    outfilename = sys.argv[5]

    doBothMinAlign = False

    doMinQAlign = False
    if '-minFractionQLen' in sys.argv:
        doMinQAlign = True
        minQAlign = float(sys.argv[sys.argv.index('-minFractionQLen') + 1])

    doMinTAlign = False
    if '-minFractionTLen' in sys.argv:
        doMinTAlign = True
        minTAlign = float(sys.argv[sys.argv.index('-minFractionTLen') + 1])

    if doMinTAlign and doMinQAlign:
        if '-minTFractionORminQFraction' in sys.argv:
            doBothMinAlign = True

    doMinPercIden = False
    if '-minPercIdent' in sys.argv:
        doMinPercIden = True
        minPercIdent = float(sys.argv[sys.argv.index('-minPercIdent') + 1])

    QueryDict={}
    sequence=''
    inputdatafile = open(query)
    for line in inputdatafile:
        if line[0]=='>':
            if sequence != '':
                QueryDict[chr] = ''.join(sequence)
            chr = line.strip().split('>')[1]
            sequence=[]
            Keep=False
            continue
        else:
            sequence.append(line.strip())
    QueryDict[chr] = ''.join(sequence)

    TargetDict={}
    sequence=''
    inputdatafile = open(target)
    for line in inputdatafile:
        if line[0]=='>':
            if sequence != '':
                TargetDict[chr] = ''.join(sequence)
            chr = line.strip().split('>')[1]
            sequence=[]
            Keep=False
            continue
        else:
            sequence.append(line.strip())
    TargetDict[chr] = ''.join(sequence)

    MatchDict = {}
    for seq in QueryDict.keys():
        MatchDict[seq] = 0

    linelist = open(blast)
    for line in linelist:
        if line.startswith('#') or line.strip() == '':
            continue
        fields = line.strip().split('\t')
        EValue = float(fields[10])
        if EValue > maxE:
            continue
        q = fields[0]
        t = fields[1]
        if doMinQAlign:
            A = float(fields[3])
            qL = len(QueryDict[q])
        if doMinTAlign:
            A = float(fields[3])
            tL = len(TargetDict[t])
        if q == 'FIG00354963:_hypothetical_protein|contig_1|257847,258185|-':
            PI = float(fields[2])
            print PI, minPercIdent, A, qL, tL, A/qL, A/tL
        if doBothMinAlign:
            if A/qL < minQAlign and A/tL < minTAlign:
                continue
        else:
            if A/qL < minQAlign:
                continue
            if A/tL < minTAlign:
                continue
        if doMinPercIden:
            PI = float(fields[2])
            if PI < minPercIdent:
                continue
        MatchDict[q] = 1

    outfile = open(outfilename,'w')
    outline = '#gene\tMatchOrNot'
    outfile.write(outline + '\n')

    genes = MatchDict.keys()
    genes.sort()
    for q in genes:
        outline = q + '\t' + str(MatchDict[q])
        outfile.write(outline + '\n')
    outfile.close()

run()

