##################################
#                                #
# Last modified 03/15/2016       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string

def _read_config(path):
    """Read the tab-separated config file and return a {label: fasta_path} dict.

    Each non-blank line must hold at least two tab-separated fields: the label
    and the FASTA file path.  Blank lines are skipped.  If a label occurs more
    than once, the last occurrence wins.
    """
    files_by_label = {}
    with open(path) as handle:
        for line in handle:
            stripped = line.strip()
            if not stripped:
                # Tolerate empty/trailing lines instead of failing on fields[1].
                continue
            fields = stripped.split('\t')
            files_by_label[fields[0]] = fields[1]
    return files_by_label


def _read_fasta_records(path):
    """Parse a FASTA file and return a list of (gene, sequence) tuples.

    The gene name is the header text after '>' truncated at the first '|' and
    at the first ':', with '/' replaced by '_' (the name is later used as part
    of an output file name).  Sequence lines are stripped and concatenated.
    Lines that appear before the first header cannot be assigned to a gene and
    are ignored; a file without any header yields an empty list.
    """
    records = []
    gene = None
    chunks = []
    with open(path) as handle:
        for line in handle:
            if line.startswith('>'):
                if gene is not None:
                    # A new header closes the record collected so far.
                    records.append((gene, ''.join(chunks)))
                gene = line.strip().split('>')[1].split('|')[0].split(':')[0].replace('/', '_')
                chunks = []
            elif gene is not None:
                chunks.append(line.strip())
    if gene is not None:
        # Flush the last record of the file.
        records.append((gene, ''.join(chunks)))
    return records


def run():
    """Regroup sequences from several labelled FASTA files into one file per gene.

    Reads two command-line arguments from sys.argv: the config file
    (label <tab> fasta_file per line) and the output prefix.  For every gene
    name found in any input, writes '<outprefix>.<gene>.faa' containing one
    record per label that has that gene, with the header '>gene|label'.
    Prints the usage text and exits with status 1 when fewer than two
    arguments are supplied.
    """
    # Both the config and the output prefix are required (argv[1] and argv[2]).
    if len(sys.argv) < 3:
        print('usage: python %s config outfileprefix' % sys.argv[0])
        print('\tConfig format: label <tab> fasta_file')
        print('\tCfasta headers are supposed to be uniform across files, i.e the same genes is named the same in all of them')
        print('\tNote that the script will split by | and : characters and only use the first resulting field as a gene name')
        sys.exit(1)

    config = sys.argv[1]
    outprefix = sys.argv[2]

    files_by_label = _read_config(config)

    # gene -> {label: sequence}; a gene repeated within one file keeps its last sequence.
    seqs_by_gene = {}
    for label, fasta in files_by_label.items():
        for gene, sequence in _read_fasta_records(fasta):
            seqs_by_gene.setdefault(gene, {})[label] = sequence

    for gene, seqs_by_label in seqs_by_gene.items():
        with open(outprefix + '.' + gene + '.faa', 'w') as outfile:
            for label, sequence in seqs_by_label.items():
                outfile.write('>' + gene + '|' + label + '\n')
                outfile.write(sequence + '\n')
   
# Run the command-line entry point only when executed as a script, so that
# importing this module does not parse sys.argv or call sys.exit().
if __name__ == '__main__':
    run()
