##################################
#                                #
# Last modified 2017/07/27       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import os
import string
import math
from sets import Set
import numpy as np
from MulticoreTSNE import MulticoreTSNE as TSNE

def _parse_field_ids(spec):
    """Expand a field specification into a list of column indices.

    ``spec`` is a comma-separated list whose items are either a single
    integer or an inclusive ``from:to`` range; e.g. ``'0,3:5'`` yields
    ``[0, 3, 4, 5]``.  The indices are later used to index the list of
    tab-separated fields of each data line.

    Raises ValueError if an item is not an integer (or integer range).
    """
    field_ids = []
    for item in spec.split(','):
        if ':' in item:
            bounds = item.split(':')
            # The upper bound is included, hence the +1.
            field_ids.extend(range(int(bounds[0]), int(bounds[1]) + 1))
        else:
            field_ids.append(int(item))
    return field_ids


def _read_rows(path, field_ids):
    """Return the selected columns of every data line in ``path``.

    The file is streamed through ``bzip2 -cd``, ``zcat`` or ``cat``
    depending on its extension.  Lines starting with ``#`` and blank
    lines are skipped.  Each remaining line is split on tabs and the
    fields listed in ``field_ids`` are converted to float.

    Returns a list with one list of floats per data line.
    Raises IndexError / ValueError on lines that lack a requested field
    or hold a non-numeric value.
    """
    if path.endswith('.bz2'):
        cmd = 'bzip2 -cd ' + path
    elif path.endswith('.gz'):
        cmd = 'zcat ' + path
    else:
        cmd = 'cat ' + path

    rows = []
    # NOTE: the command is run through the shell, so paths containing
    # spaces or shell metacharacters are not supported.
    pipe = os.popen(cmd, 'r')
    try:
        for line in pipe:
            if line.startswith('#') or not line.strip():
                continue
            # Strip only the line terminator: a full strip() would also
            # drop leading/trailing tabs and shift the column indices
            # whenever the first or last field is empty.
            fields = line.rstrip('\r\n').split('\t')
            rows.append([float(fields[i]) for i in field_ids])
    finally:
        # close() reports the exit status of the child (None on success).
        status = pipe.close()
    if status:
        sys.stderr.write('warning: "%s" exited with status %s\n' % (cmd, status))
    return rows


def _write_table(out, groups, embedding):
    """Write the 2-D embedding to ``out`` as a tab-separated table.

    The header line names one column per distinct label in ``groups``
    (sorted).  Each following line holds the first coordinate in the
    first column and the second coordinate in the column belonging to
    the label of that point; all other label columns are left empty.
    """
    labels = sorted(set(groups))
    # 1-based column position of every label, looked up once per row.
    column = {label: k for k, label in enumerate(labels, 1)}
    n_labels = len(labels)

    out.write('#tSNE1\t' + '\t'.join(labels) + '\n')
    for (t1, t2), group in zip(embedding, groups):
        k = column[group]
        # k tabs place t2 in the label's column; the remaining tabs pad
        # the row (this keeps the one trailing empty column the row
        # layout has always had).
        out.write(str(t1) + '\t' * k + str(t2) + '\t' * (n_labels + 1 - k) + '\n')


def run():
    """Run t-SNE on selected columns of several files and write the result.

    Command line: ``config fieldIDs Ncores perplexity outfile``.

    * ``config``: text file with one ``label <tab> filename`` entry per line.
    * ``fieldIDs``: columns to use, comma-separated and/or ``from:to``
      (inclusive) ranges.
    * ``Ncores``: passed to MulticoreTSNE as ``n_jobs``.
    * ``perplexity``: integer perplexity passed to MulticoreTSNE.
    * ``outfile``: path the tab-separated embedding table is written to.

    Prints a usage message and exits with status 1 when arguments are
    missing.
    """
    # All five arguments are required; argv[0] is the script name.
    if len(sys.argv) < 6:
        print('usage: python %s config fieldIDs Ncores perplexity outfile' % sys.argv[0])
        print('\tconfig format: label <tab> filename')
        print('\tfieldIDs format: some combination of comma-separation and from:to (included)')
        print('\tthe embedding will be written to outfile')
        sys.exit(1)

    config = sys.argv[1]
    field_ids = _parse_field_ids(sys.argv[2])
    n_cores = int(sys.argv[3])
    perplexity = int(sys.argv[4])
    out_path = sys.argv[5]

    groups = []   # label of every data row, parallel to `rows`
    rows = []     # numeric feature vectors fed to t-SNE

    with open(config) as config_file:
        for entry in config_file:
            if not entry.strip():
                continue
            parts = entry.strip().split('\t')
            label = parts[0]
            data = _read_rows(parts[1], field_ids)
            rows.extend(data)
            groups.extend([label] * len(data))

    tsne = TSNE(n_jobs=n_cores, perplexity=perplexity)
    embedding = tsne.fit_transform(np.array(rows))

    with open(out_path, 'w') as outfile:
        _write_table(outfile, groups, embedding)
   
# Only run the pipeline when executed as a script, not when imported.
if __name__ == '__main__':
    run()
