##################################
#                                #
# Last modified 04/27/2015       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
import os
from sets import Set

def _species_from_id(seq_id, renamed):
    """Return the species prefix of a sequence ID.

    The species is everything before the first '|' when ``renamed`` is
    true (the -HOG_renamed ID layout) and everything before the first
    ':::' otherwise.
    """
    separator = '|' if renamed else ':::'
    return seq_id.split(separator)[0]


def _load_hog_fasta(list_path, renamed):
    """Read the FASTA headers of every file named in ``list_path``.

    ``list_path`` holds one FASTA path per line; only the first
    tab-delimited column of each line is used and blank lines are skipped.
    The group (HOG) name is the file's base name up to the first '.fa'.

    Returns a tuple ``(hog_dict, seq_to_hog, species)`` where
    ``hog_dict[group][seq_id]`` is an (initially empty) list of domain
    hits, ``seq_to_hog[seq_id]`` is the group the sequence was read from,
    and ``species`` is the set of species prefixes seen.

    Exits with status 1 if two listed files yield the same group name.
    """
    hog_dict = {}
    seq_to_hog = {}
    species = set()
    with open(list_path) as listing:
        for entry in listing:
            fasta = entry.strip().split('\t')[0]
            if not fasta:
                # A blank line would otherwise become open('').
                continue
            # Derive the name from the cleaned path so that neither the
            # trailing newline nor extra columns can leak into it.
            group = fasta.split('/')[-1].split('.fa')[0]
            if group in hog_dict:
                print('duplicate file names detected, exiting')
                print(group)
                sys.exit(1)
            hog_dict[group] = {}
            with open(fasta) as handle:
                for line in handle:
                    # Only header lines matter; sequence lines are ignored.
                    if not line.startswith('>'):
                        continue
                    seq_id = line.strip().split('>')[1]
                    seq_to_hog[seq_id] = group
                    hog_dict[group][seq_id] = []
                    species.add(_species_from_id(seq_id, renamed))
    return hog_dict, seq_to_hog, species


def _load_tblout_hits(list_path, renamed, hog_dict, seq_to_hog):
    """Append the domain hits from every tblout file named in ``list_path``.

    Each non-comment, non-blank tblout line is split on tabs; column 0 is
    taken as the domain name and column 2 as the sequence ID.  When
    ``renamed`` is true the ID is cut at its first '|' and ':::' is
    replaced by '|' before it is looked up.  Hits whose ID is not in
    ``seq_to_hog`` are ignored.  ``hog_dict`` is updated in place.
    """
    with open(list_path) as listing:
        for entry in listing:
            tbl_path = entry.strip().split('\t')[0]
            if not tbl_path:
                continue
            with open(tbl_path) as handle:
                for line in handle:
                    if line.startswith('#') or not line.strip():
                        continue
                    fields = line.strip().split('\t')
                    domain = fields[0]
                    seq_id = fields[2]
                    if renamed:
                        seq_id = seq_id.split('|')[0].replace(':::', '|')
                    if seq_id not in seq_to_hog:
                        continue
                    hog_dict[seq_to_hog[seq_id]][seq_id].append(domain)


def _write_count_table(out_path, hog_dict, species_list, renamed):
    """Write one row per (group, domain) with per-species hit counts.

    The header is '#Group', 'Domain' and then one column per entry of
    ``species_list``; all fields are tab-separated.  Groups without any
    domain hit produce no row.
    """
    with open(out_path, 'w') as out:
        out.write('\t'.join(['#Group', 'Domain'] + species_list) + '\n')
        for group in hog_dict.keys():
            # NOTE(review): looks like leftover debugging output for one
            # specific group; kept so stdout is unchanged -- confirm and drop.
            if group == 'OMA-PWO-13':
                print(hog_dict[group])
            counts = {}  # species -> {domain: number of hits}
            domains = set()
            for seq_id, hits in hog_dict[group].items():
                per_species = counts.setdefault(
                    _species_from_id(seq_id, renamed), {})
                for domain in hits:
                    domains.add(domain)
                    per_species[domain] = per_species.get(domain, 0) + 1
            # Sorted so the row order within a group is reproducible.
            domains = sorted(domains)
            if len(domains) > 1:
                print('multiple domains detected: %s %s' % (group, domains))
            for domain in domains:
                row = [group, domain]
                for species in species_list:
                    row.append(str(counts.get(species, {}).get(domain, 0)))
                out.write('\t'.join(row) + '\n')


def run():
    """Tabulate per-species domain hit counts for each HOG FASTA file.

    Reads its arguments from ``sys.argv``: a file listing FASTA paths, a
    file listing tab-delimited HMMER tblout paths, and the output file
    name, optionally followed by '-HOG_renamed' to select the
    'species|...' sequence ID layout instead of 'species:::...'.

    Prints the usage text and exits with status 1 when fewer than three
    positional arguments are supplied.
    """
    # Three positional arguments are read below, so argv needs >= 4 items.
    if len(sys.argv) < 4:
        print('usage: python %s list_of_HOG_fasta_files list_of_tblout_files outfilename [-HOG_renamed]' % sys.argv[0])
        print('\tHMMER tblout files should be tab-delimited')
        print('\tlist_of_fasta_files format: one file per line')
        print('\tuse the -HOG_renamed option if the HOG have been derived from the OMA-PWOtoFasta.py script')
        sys.exit(1)

    FFlist = sys.argv[1]
    TBLlist = sys.argv[2]
    outfilename = sys.argv[3]
    doRename = '-HOG_renamed' in sys.argv

    HOGDict, SeqtoHOGDict, species = _load_hog_fasta(FFlist, doRename)
    print('finished inputting fasta files')

    SpeciesList = sorted(species)

    _load_tblout_hits(TBLlist, doRename, HOGDict, SeqtoHOGDict)
    print('finished inputting tblout files')

    _write_count_table(outfilename, HOGDict, SpeciesList, doRename)

# Run only when executed as a script, so that importing this module does
# not immediately parse sys.argv and start processing files.
if __name__ == '__main__':
    run()

