##################################
#                                #
# Last modified 04/26/2015       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
import os
from sets import Set

def _read_list_file(path):
    """Return the first tab-separated column of each non-blank line of *path*."""
    with open(path) as handle:
        return [line.strip().split('\t')[0] for line in handle if line.strip()]


def _load_fasta(path, records):
    """Parse the FASTA file *path* into *records* (header text -> sequence).

    The key is the text between the leading '>' and the next '>' (if any) on
    the header line, stripped of surrounding whitespace. Sequence lines are
    stripped and concatenated. Lines that appear before the first header of
    the file are ignored, so parser state never leaks between files.
    """
    header = None
    chunks = []
    with open(path) as handle:
        for line in handle:
            if line.startswith('>'):
                if header is not None:
                    records[header] = ''.join(chunks)
                header = line.strip().split('>')[1]
                chunks = []
            elif header is not None:
                chunks.append(line.strip())
    # Flush the last record of the file (nothing to flush if no header seen).
    if header is not None:
        records[header] = ''.join(chunks)


def _load_ortholog_pairs(path, graph):
    """Add the ortholog pairs listed in *path* to the undirected *graph*.

    Lines starting with '#' and blank lines are skipped. The two protein IDs
    are read from the third and fourth tab-separated columns.
    """
    with open(path) as handle:
        for line in handle:
            if line.startswith('#') or not line.strip():
                continue
            fields = line.strip().split('\t')
            first, second = fields[2], fields[3]
            graph.setdefault(first, set()).add(second)
            graph.setdefault(second, set()).add(first)


def _neighborhood(graph, start, max_steps=4):
    """Return the frozenset of nodes within *max_steps* edges of *start*.

    A level-by-level breadth-first search; it yields the same node set as
    enumerating every walk of up to *max_steps* edges, without revisiting
    nodes.
    """
    seen = set([start])
    frontier = [start]
    for _ in range(max_steps):
        next_frontier = []
        for node in frontier:
            for neighbor in graph[node]:
                if neighbor not in seen:
                    seen.add(neighbor)
                    next_frontier.append(neighbor)
        if not next_frontier:
            break
        frontier = next_frontier
    return frozenset(seen)


def run():
    """Group proteins by pairwise orthology and write one FASTA file per group.

    Command-line arguments (read from ``sys.argv``):
        1. PairwiseOrthologs_list -- file listing pairwise-ortholog tables,
           one path per line (first tab-separated column is used).
        2. list_of_fasta_files -- file listing FASTA files, one path per line.
        3. outfile_prefix -- groups are written to ``<prefix><N>.fa`` with N
           counting from 1.

    For every protein seen in the ortholog tables, a group is formed from all
    proteins reachable from it in at most four ortholog links; duplicate
    groups are collapsed. Groups are numbered in lexicographic order of their
    sorted member IDs so that the numbering is reproducible.

    Each output record is headed ``>`` + (ID text before ':::') + ``|`` +
    (text between ':::' and the first '|'), followed by the sequence and a
    blank line.

    Raises:
        SystemExit: with status 1 when fewer than three arguments are given.
        KeyError: when a grouped protein ID has no sequence in the FASTA files.
        IndexError: when an ortholog line has fewer than four columns or a
            protein ID does not contain ':::'.
    """
    # Three positional arguments are required, i.e. argv must hold four items.
    if len(sys.argv) < 4:
        # Single-argument print() calls behave the same on Python 2 and 3.
        print('usage: python %s PairwiseOrthologs_list list_of_fasta_files outfile_prefix' % sys.argv[0])
        print('\tPairwiseOrthologs_list format: one file per line')
        print('\tlist_of_fasta_files format: one file per line')
        sys.exit(1)

    ortholog_list_path = sys.argv[1]
    fasta_list_path = sys.argv[2]
    outprefix = sys.argv[3]

    sequences = {}
    for fasta_path in _read_list_file(fasta_list_path):
        _load_fasta(fasta_path, sequences)

    graph = {}
    for table_path in _read_list_file(ortholog_list_path):
        _load_ortholog_pairs(table_path, graph)

    # frozensets are hashable, so identical groups collapse in the outer set.
    groups = set(_neighborhood(graph, protein) for protein in graph)

    # Sets only define a partial (subset) order; sort on member lists instead
    # to get a total, deterministic order for the output numbering.
    ordered_groups = sorted(sorted(group) for group in groups)

    for index, proteins in enumerate(ordered_groups, 1):
        with open(outprefix + str(index) + '.fa', 'w') as outfile:
            for protein in proteins:
                gene = protein.split('|')[0].split(':::')[1]
                outfile.write('>' + protein.split(':::')[0] + '|' + gene + '\n')
                outfile.write(sequences[protein] + '\n')
                outfile.write('\n')

# Run only when executed as a script, so that importing this module does not
# parse sys.argv or write any files.
if __name__ == '__main__':
    run()

