##################################
#                                #
# Last modified 02/05/2013       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
import math
from sets import Set

def run():
    """Write, for every enriched GO term, the genes of interest annotated to it.

    Arguments are read from ``sys.argv``:

    1. list_of_genes -- tab-delimited text file; lines starting with '#'
       are ignored.
    2. gene_fieldID -- 0-based index of the column of ``list_of_genes``
       that holds the gene name.
    3. funcassociate_go_associations.txt -- tab-delimited file whose first
       two columns are compared against the enriched (attrib ID,
       attrib name) pairs and whose third column is a space-separated
       list of genes.
    4. funcassociate_results.tsv -- tab-delimited file whose columns 5 and
       6 (0-based) hold 'attrib ID' and 'attrib name'.
    5. outputfilename -- file to (over)write.

    One line is written per association line whose term appears in the
    results file: term ID, term name, and the space-separated subset of its
    genes present in ``list_of_genes``.  Gene names are compared
    case-insensitively but written as spelled in the associations file.

    Prints a usage message and exits with status 1 when fewer than five
    arguments are supplied.
    """
    # Five positional arguments are required, i.e. six entries in argv
    # (the previous "< 5" check let a missing output name through and then
    # failed with an IndexError on sys.argv[5]).
    if len(sys.argv) < 6:
        print('usage: python %s  list_of_genes gene_fieldID funcassociate_go_associations.txt funcassociate_results.tsv outputfilename' % sys.argv[0])
        sys.exit(1)

    gene_list_path = sys.argv[1]
    gene_field = int(sys.argv[2])
    associations_path = sys.argv[3]
    results_path = sys.argv[4]
    output_path = sys.argv[5]

    # Comment and FuncAssociate header lines that carry no data.
    skip_prefixes = (
        '#',
        'OVERREPRESENTED ATTRIBUTES',
        'N\tX\tLOD\tP\tP_adj\tattrib ID\tattrib name',
    )

    # Upper-cased names of the genes of interest.
    wanted_genes = set()
    with open(gene_list_path) as gene_file:
        for line in gene_file:
            # Blank lines carry no gene; skip them along with comments.
            if line.startswith('#') or not line.strip():
                continue
            fields = line.strip().split('\t')
            wanted_genes.add(fields[gene_field].upper())

    # (attrib ID, attrib name) pairs reported in the results file.
    enriched_terms = set()
    with open(results_path) as results_file:
        for line in results_file:
            if line.startswith(skip_prefixes) or not line.strip():
                continue
            fields = line.strip().split('\t')
            enriched_terms.add((fields[5], fields[6]))

    # The output is opened only after both inputs were read successfully, and
    # the context managers close every file even when an error occurs.
    with open(output_path, 'w') as outfile:
        with open(associations_path) as assoc_file:
            for line_number, line in enumerate(assoc_file, 1):
                if line.startswith(skip_prefixes):
                    continue
                fields = line.strip().split('\t')
                if len(fields) < 2:
                    # Too few columns to identify a term.
                    print('skipping line %d' % line_number)
                    continue
                if (fields[0], fields[1]) not in enriched_terms:
                    continue
                # A term without a gene column yields an empty gene list
                # instead of an IndexError.
                candidates = fields[2].split(' ') if len(fields) > 2 else []
                matched = [gene for gene in candidates
                           if gene.upper() in wanted_genes]
                outline = fields[0] + '\t' + fields[1] + '\t' + ' '.join(matched)
                outfile.write(outline.strip() + '\n')

# Run the command-line entry point only when executed as a script, so that
# importing this module does not parse sys.argv or touch any files.
if __name__ == '__main__':
    run()
