##################################
#                                #
# Last modified 2017/01/02       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
import math
import copy
from sets import Set

def run():
    """Build a domain presence/absence matrix from per-species hit tables.

    Command-line arguments (read from ``sys.argv``):
        1. config -- tab-separated file with one ``ID <tab> hit-table path``
           per line; lines starting with ``#`` and blank lines are skipped.
        2. perfect-match threshold (float) -- a domain whose best score is at
           or below this value is written as ``1``.
        3. ?-match threshold (float) -- a domain whose best score lies
           strictly between the two thresholds is written as ``?``.
        4. outfile_prefix -- prefix of the two output files.

    In every hit table the third tab-separated column (index 2) is used as
    the domain name and the seventh (index 6) is parsed as a float score.
    Lines starting with ``#`` and blank lines are skipped.  Only the smallest
    score seen for each (species, domain) pair is kept.

    Output files:
        ``<prefix>.domain_order`` -- ``index <tab> domain`` for every domain
            seen in any table, in sorted order.
        ``<prefix>.fa`` -- for each species (sorted), a ``>species`` header
            line followed by one line holding one character per domain, in
            the same order as ``.domain_order``: ``1``, ``?`` or ``0``.

    Exits with status 1 when too few arguments are given or when the same
    species ID appears twice in the config file.
    """
    # sys.argv[4] is read below, so the script name plus four arguments are
    # required; anything shorter must show the usage text rather than fail
    # with an IndexError.
    # NOTE: print is always called with one parenthesised argument so the
    # output is the same under Python 2's print statement and Python 3.
    if len(sys.argv) < 5:
        print('usage: python %s config perfect_match_thrshold ?match_threshold outfile_prefix' % sys.argv[0])
        print('\tNote: config file format: ID <tab> PFam-27-A.tab')
        print('\tNote: if you do not want any ? matches, set the two p-values cutoffs to the same value')
        sys.exit(1)

    config = sys.argv[1]
    ev1 = float(sys.argv[2])
    ev2 = float(sys.argv[3])
    outprefix = sys.argv[4]

    # species -> {domain -> smallest score seen for that domain}
    best_score = {}
    # every domain seen in any species
    all_domains = set()

    with open(config) as config_file:
        for config_line in config_file:
            if config_line.startswith('#'):
                continue
            config_line = config_line.strip()
            if not config_line:
                # tolerate blank lines (e.g. a trailing newline)
                continue
            config_fields = config_line.split('\t')
            species = config_fields[0]
            hits_path = config_fields[1]
            print('%s %s' % (species, hits_path))

            # Reject duplicates before opening the hit table so that no file
            # handle is left dangling on the error path.
            if species in best_score:
                print('duplicated species detected, exiitng')
                sys.exit(1)
            species_scores = {}
            best_score[species] = species_scores

            with open(hits_path) as hits_file:
                for hit_line in hits_file:
                    if hit_line.startswith('#'):
                        continue
                    hit_line = hit_line.strip()
                    if not hit_line:
                        continue
                    hit_fields = hit_line.split('\t')
                    domain = hit_fields[2]
                    score = float(hit_fields[6])
                    if domain in species_scores:
                        species_scores[domain] = min(score, species_scores[domain])
                    else:
                        species_scores[domain] = score
                    all_domains.add(domain)

    domains = sorted(all_domains)

    with open(outprefix + '.domain_order', 'w') as order_file:
        for index, domain in enumerate(domains):
            order_file.write('%d\t%s\n' % (index, domain))

    with open(outprefix + '.fa', 'w') as matrix_file:
        for species in sorted(best_score):
            species_scores = best_score[species]
            states = []
            for domain in domains:
                if domain not in species_scores:
                    states.append('0')
                    continue
                score = species_scores[domain]
                if score <= ev1:
                    states.append('1')
                elif ev1 < score < ev2:
                    states.append('?')
                else:
                    states.append('0')
            matrix_file.write('>' + species + '\n')
            matrix_file.write(''.join(states) + '\n')

   
# Script entry point: this call is unguarded, so run() executes whenever the
# module is loaded, including on import.
run()
