##################################
#                                #
# Last modified 09/21/2011       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
import math
from sets import Set

def _parse_field_ids(spec):
    """Return the sorted integer column indices named in a comma-separated string."""
    return sorted(int(token) for token in spec.split(','))


def _split_row(line):
    """Split one tab-delimited line into its fields.

    Only the line terminator is removed, so empty leading or trailing
    columns keep their position instead of being stripped away.
    """
    return line.rstrip('\r\n').split('\t')


def _load_homology(path):
    """Map every mouse ID in the homology report to its human counterpart(s).

    Column 0 is read as the mouse name, column 3 as the human name, and
    columns 5 and 6 as comma-separated mouse and human ID lists.  No line
    is treated specially, so a header line (if present) is stored like any
    other row.  Blank lines are skipped.

    Returns a dict: mouse ID -> (mouse name, human name, list of human IDs).
    Raises IndexError if a non-blank row has fewer than seven columns.
    """
    homology = {}
    with open(path) as handle:
        for line in handle:
            if not line.strip():
                continue
            fields = _split_row(line)
            mouse_name = fields[0]
            human_name = fields[3]
            human_ids = fields[6].split(',')
            for mouse_id in fields[5].split(','):
                homology[mouse_id] = (mouse_name, human_name, human_ids)
    return homology


def _load_table(path, field_ids):
    """Read the selected columns of a tab-delimited table with a '#' header.

    A line starting with '#' is taken as the header; its selected columns
    become the output labels (a later '#' line replaces an earlier one).
    Every other non-blank line is a data row keyed by its first column.

    Returns (labels, rows): labels is a list aligned with field_ids, and
    rows maps gene ID -> list of values aligned with field_ids.
    Raises ValueError if a data row appears before any header line, and
    IndexError if a line lacks one of the requested columns.
    """
    labels = None
    rows = {}
    with open(path) as handle:
        for line in handle:
            if not line.strip():
                continue
            fields = _split_row(line)
            if line.startswith('#'):
                labels = [fields[index] for index in field_ids]
                continue
            if labels is None:
                raise ValueError(
                    "%s: data row found before the '#' header line" % path)
            # Values are stored by position, not by label, so two columns
            # sharing a label cannot overwrite each other.
            rows[fields[0]] = [fields[index] for index in field_ids]
    if labels is None:
        labels = []
    return labels, rows


def run():
    """Join a mouse table and a human table through the homology report.

    Command-line arguments (sys.argv[1:7]):
      1. homology report (HMD_HumanSequence.rpt layout)
      2. mouse table, 3. comma-separated column indices to keep from it
      4. human table, 5. comma-separated column indices to keep from it
      6. output file name

    For each mouse-table gene ID present in the homology report, and each
    of its human IDs present in the human table, one tab-delimited row is
    written: mouse ID, mouse name, human ID, human name, the selected mouse
    columns, then the selected human columns.  Prints usage and exits with
    status 1 when fewer than six arguments are given.
    """
    # Six positional arguments are consumed (sys.argv[1] .. sys.argv[6]),
    # so sys.argv must hold at least seven entries.
    if len(sys.argv) < 7:
        print('usage: python %s  HMD_HumanSequence.rpt mouseTable fields_to_keep humanTable fields_to_kewp outfilename' % sys.argv[0])
        print('       Note: this script is wirtten for the HMD_HumanSequence.rpt file in the following format:')
        print('       Aars    MGI:2384560     234734  AARS    16      NM_146217       NM_001605       NP_666329       NP_001596       Q8BGQ7  P49588          NT,AA   J:90500,J:58000')
        print('       The script assumes a header line')
        print('       field IDs should be comma-separated')
        sys.exit(1)

    datafilename = sys.argv[1]
    mouseTable = sys.argv[2]
    mouseFields = _parse_field_ids(sys.argv[3])
    humanTable = sys.argv[4]
    humanFields = _parse_field_ids(sys.argv[5])
    outfilename = sys.argv[6]

    homology = _load_homology(datafilename)
    mouse_labels, mouse_rows = _load_table(mouseTable, mouseFields)
    human_labels, human_rows = _load_table(humanTable, humanFields)

    header = ['#mouseID', 'mouseName', 'humanID', 'humanName']
    header = header + mouse_labels + human_labels

    # The output is opened only after every input parsed cleanly, so a bad
    # input file does not truncate an existing result.
    with open(outfilename, 'w') as outfile:
        outfile.write('\t'.join(header) + '\n')
        for mouse_id, mouse_values in mouse_rows.items():
            if mouse_id not in homology:
                continue
            mouse_name, human_name, human_ids = homology[mouse_id]
            for human_id in human_ids:
                if human_id not in human_rows:
                    continue
                row = [mouse_id, mouse_name, human_id, human_name]
                row = row + mouse_values + human_rows[human_id]
                # join() keeps empty trailing values, so every row has the
                # same number of columns as the header.
                outfile.write('\t'.join(row) + '\n')
        
# Script entry point: this call is not guarded, so run() executes whenever
# the module is loaded (whether run directly or imported).
run()

