##################################
#                                #
# Last modified 2017/01/02       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
import math
import copy
from sets import Set

def _usage_exit():
    """Print the command-line usage text and terminate with exit status 1."""
    print('usage: python %s cog.csv outfile_prefix [-useALL 0,1,2,3]' % sys.argv[0])
    print('\tNote: the script will output only a single COG match per species, and it will be the best hit; hits other than perfect matches will be printed as ? signs')
    print('\t\tuse the [-useALL] option if you want all hits as 1 characters')
    sys.exit(1)


def _read_best_matches(path):
    """Read the COG CSV at *path* and keep one match type per (species, COG).

    Lines with fewer than 8 comma-separated fields are skipped.  Field 1 is
    used as the species, field 6 as the COG and field 7 as the match type.

    Returns a tuple ``(matches, cogs)`` where ``matches`` maps
    species -> {COG: match type string} and ``cogs`` is the sorted list of
    every COG seen in the file.
    """
    matches = {}
    cogs = set()
    with open(path) as handle:
        for line in handle:
            fields = line.strip().split(',')
            if len(fields) < 8:
                continue
            species = fields[1]
            cog = fields[6]
            match_type = fields[7]
            cogs.add(cog)
            per_species = matches.setdefault(species, {})
            # Fields are strings, so the perfect match must be compared as
            # '0'.  The first hit is kept unless a later hit is a perfect
            # match; other match types are not ranked against each other.
            if cog not in per_species or match_type == '0':
                per_species[cog] = match_type
    return matches, sorted(cogs)


def _write_cog_order(path, cogs):
    """Write one ``index<TAB>COG`` line per COG, in the given order."""
    with open(path, 'w') as out:
        for index, cog in enumerate(cogs):
            out.write('%d\t%s\n' % (index, cog))


def _write_presence_fasta(path, matches, cogs, accepted):
    """Write one FASTA-style record per species with a character per COG.

    For each COG (in the order of *cogs*): '0' if the species has no hit,
    '1' if its retained match type is in *accepted*, '?' otherwise.
    """
    with open(path, 'w') as out:
        for species in sorted(matches):
            hits = matches[species]
            row = []
            for cog in cogs:
                if cog not in hits:
                    row.append('0')
                elif hits[cog] in accepted:
                    row.append('1')
                else:
                    row.append('?')
            out.write('>' + species + '\n')
            out.write(''.join(row) + '\n')


def run():
    """Convert a COG CSV into a COG order table and a presence/absence file.

    Command line (read from ``sys.argv``)::

        python script.py cog.csv outfile_prefix [-useALL 0,1,2,3]

    Writes ``<outfile_prefix>.COG_order`` (index and COG name per line, COGs
    sorted) and ``<outfile_prefix>.fa`` (one record per species, sorted).
    Without ``-useALL`` only match type '0' is written as '1'; with
    ``-useALL`` the listed match types (plus '0') are written as '1'.  Any
    other retained hit is written as '?', and a missing COG as '0'.

    Exits with status 1 after printing usage when the input file or output
    prefix is missing, or when ``-useALL`` is given without a value.
    """
    # Both the input file and the output prefix are required.
    if len(sys.argv) < 3:
        _usage_exit()

    csv_path = sys.argv[1]
    outprefix = sys.argv[2]

    # A perfect match ('0') always counts as present.
    accepted = set(['0'])
    if '-useALL' in sys.argv:
        value_index = sys.argv.index('-useALL') + 1
        if value_index >= len(sys.argv):
            _usage_exit()
        accepted.update(sys.argv[value_index].split(','))

    matches, cogs = _read_best_matches(csv_path)

    _write_cog_order(outprefix + '.COG_order', cogs)
    _write_presence_fasta(outprefix + '.fa', matches, cogs, accepted)

   
# Run the conversion only when this file is executed as a script, so that
# importing the module does not parse sys.argv or write any files.
if __name__ == '__main__':
    run()
