##################################
#                                #
# Last modified 2019/05/08       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
import math
import gzip
from sets import Set

def run():

    if len(sys.argv) < 5:
        print 'usage: python %s PFam32-A.tab domains domains_file_fieldID max_c-Evalue output'
        print '\tuse - for stdin'
        print '\tthe script can read .gz files'
        sys.exit(1)

    input = sys.argv[1]
    domainsfile = sys.argv[2]
    dFieldID = int(sys.argv[3])
    maxE = float(sys.argv[4])
    outfilename = sys.argv[5]

    DomainDict = {}

    if domainsfile == '-':
        lineslist  = sys.stdin
    elif domainsfile.endswith('.gz'):
        lineslist  = gzip.open(domainsfile)
    else:
        lineslist  = open(domainsfile)
    for line in lineslist:
        if line.startswith('#'):
            continue
        if line.strip() == '':
            continue
        fields = line.strip().split('\t')
        DomainDict[fields[dFieldID]] = 1

    ProteinDict = {}
 
    if input == '-':
        lineslist  = sys.stdin
    elif input.endswith('.gz'):
        lineslist  = gzip.open(input)
    else:
        lineslist  = open(input)
    for line in lineslist:
        if line.startswith('#'):
            continue
        if line.strip() == '':
            continue
        fields = line.strip().split('\t')
        cEval = float(fields[6])
        if cEval > maxE:
            continue
        protein = fields[0]
        domain = fields[2]
        if DomainDict.has_key(fields[2]):
            pass
        else:
            continue
        if ProteinDict.has_key(protein):
            pass
        else:
            ProteinDict[protein] = {}
        if ProteinDict[protein].has_key(domain):
            pass
        else:
            ProteinDict[protein][domain] = 0
        ProteinDict[protein][domain] += 1

    outfile = open(outfilename,'w')

    domains = DomainDict.keys()
    domains.sort()

    outline = '#protein'
    for domain in domains:
        outline += '\t'
        outline += domain
    outfile.write(outline + '\n')

    for protein in ProteinDict.keys():
        outline = protein
        for domain in domains:
            if ProteinDict[protein].has_key(domain):
                outline = outline + '\t' + str(ProteinDict[protein][domain])
            else:
                outline = outline + '\t0'
        outfile.write(outline + '\n')

    outfile.close()

run()