##################################
#                                #
# Last modified 04/29/2015       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
import os
from sets import Set

def run():

    if len(sys.argv) < 3:
        print 'usage: python %s fasta tblout output [-collapseDups] [-proteinFilter] [-startM]' % sys.argv[0]
        print '\tNote: the script will split the FASTA IDs by space'
        print '\tNote: the -proteinFilter option will remove all sequences that start or end with an X'
        print '\tNote: the -startM option works only when the -proteinFilter is on, and will remove all sequences that do not begin with an M'
        sys.exit(1)

    fasta = sys.argv[1]
    tblout = sys.argv[2]
    outfilename = sys.argv[3]

    doCollapse = False
    if '-collapseDups' in sys.argv:
        doCollapse = True

    doProteinFilter = False
    if '-proteinFilter' in sys.argv:
        doProteinFilter = True
        doM = False
        if '-startM' in sys.argv:
            doM = True

    SeqDict = {}

    GenomeDict={}
    sequence=''
    inputdatafile = open(fasta)
    for line in inputdatafile:
        if line[0]=='>':
            if sequence != '':
                if doCollapse:
                    if SeqDict.has_key(''.join(sequence)):
                        pass
                    else:
                        GenomeDict[chr] = ''.join(sequence)
                        SeqDict[''.join(sequence)] = 1
                else:
                    GenomeDict[chr] = ''.join(sequence)
            chr = line.strip().split('>')[1].split(' ')[0]
            sequence=[]
            continue
        else:
            sequence.append(line.strip())
    GenomeDict[chr] = ''.join(sequence)

    outfile = open(outfilename,'w')

    SeenDict = {}

    inputdatafile = open(tblout)
    for line in inputdatafile:
        if line.startswith('#'):
            continue
        fields = line.strip().split('\t')
        ID = fields[2]
        if SeenDict.has_key(ID):
            continue
        else:
            SeenDict[ID] = 1
        if GenomeDict.has_key(ID):
            sequence = GenomeDict[ID]
            if doProteinFilter:
                if sequence.startswith('X') or sequence.endswith('X'):
                    continue
                if doM:
                    if sequence.startswith('M'):
                        pass
                    else:
                        continue
            outline = '>' + ID
            outfile.write(outline + '\n')
            for i in range(0,len(sequence),100):
                outfile.write(sequence[i:min(i+100, len(sequence))] + '\n')

    outfile.close()

run()

