##################################
#                                #
# Last modified 04/28/2015       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
import os
from sets import Set

def _load_transcript_to_peptide(pep_path):
    """Build a transcript-ID -> peptide-ID dictionary from FASTA headers.

    Only lines starting with '>' are inspected.  The peptide ID is the text
    between '>' and the first space; the transcript ID is the text between
    'DNA_ID=' and the next space (or the end of the line).  Headers without a
    'DNA_ID=' token are reported on stdout and skipped.  If a transcript ID
    occurs more than once, the last header wins.
    """
    mapping = {}
    with open(pep_path) as handle:
        for line in handle:
            if not line.startswith('>'):
                continue
            # Strip first: when DNA_ID= is the last token on the line the
            # newline would otherwise become part of the transcript ID and
            # the later table lookup could never match.
            header = line.strip()
            try:
                transcript = header.split('DNA_ID=')[1].split(' ')[0]
                peptide = header.split('>')[1].split(' ')[0]
            except IndexError:
                print('problem with:')
                print(header)
                continue
            mapping[transcript] = peptide
    return mapping


def _insert_column(fields, index, value):
    """Return fields as a tab-joined row with value inserted after fields[index]."""
    row = fields[:index + 1] + [value] + fields[index + 1:]
    return '\t'.join(row).strip()


def run():
    """Add a peptide-ID column to a tab-delimited table.

    Command-line arguments (read from sys.argv):
        1. pep.fa            - FASTA file whose headers carry 'DNA_ID=<transcript>'
        2. table             - tab-delimited input table
        3. transcriptFieldID - 0-based column index holding the transcript ID
        4. outfile           - path of the output file (the usage text calls it
                               'outfile_prefix', but it is opened verbatim)

    Every line of the table is copied to the output with one extra column
    inserted right after the transcript column: 'pep' for lines starting
    with '#', the mapped peptide ID for data lines, or 'nan' when the
    transcript ID is not present in the FASTA file.  A summary with the
    number of unmapped rows and the total number of data rows is printed.

    Exits with status 1 if fewer than four arguments are given.  Raises
    ValueError if transcriptFieldID is not an integer and IndexError if a
    data row has fewer than transcriptFieldID + 1 columns.
    """
    # Four user arguments are required, i.e. sys.argv must hold five items.
    if len(sys.argv) < 5:
        print('usage: python %s pep.fa table transcriptFieldID outfile_prefix' % sys.argv[0])
        sys.exit(1)

    pep_path = sys.argv[1]
    table_path = sys.argv[2]
    field_id = int(sys.argv[3])
    out_path = sys.argv[4]

    transcript_to_peptide = _load_transcript_to_peptide(pep_path)

    problematic = 0
    total = 0

    with open(table_path) as table:
        with open(out_path, 'w') as outfile:
            for line in table:
                fields = line.strip().split('\t')
                if line.startswith('#'):
                    # Header/comment line: label the new column.
                    outfile.write(_insert_column(fields, field_id, 'pep') + '\n')
                    continue
                peptide = transcript_to_peptide.get(fields[field_id])
                if peptide is None:
                    peptide = 'nan'
                    problematic += 1
                total += 1
                outfile.write(_insert_column(fields, field_id, peptide) + '\n')

    # Single-argument form prints the same text under the Python 2 print
    # statement and the Python 3 print function.
    print('%s ID conversions out of  %s' % (problematic, total))

# Called unconditionally at module level, so the conversion starts as soon as
# this file is executed (or imported).
run()

