##################################
#                                #
# Last modified 01/16/2011       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
import numpy
import random
from sets import Set

def _scaled_strings(raw_values, rtm_fraction=None):
    """Return the scaled values of one row, formatted as strings.

    raw_values   -- the numeric values of the row, in file order
    rtm_fraction -- None to divide every value by the row maximum;
                    otherwise the fraction of the lowest values whose mean
                    is used as baseline: (value - baseline) / baseline

    A row whose maximum is 0 yields '0' for every value in either mode.
    """
    values = numpy.array(raw_values, dtype=float)
    peak = numpy.max(values)
    if peak == 0:
        return ['0'] * len(values)
    if rtm_fraction is None:
        return [str(value / peak) for value in values]
    # Average at least one value: a small fraction on a short row would
    # otherwise select an empty slice and turn every output into nan.
    cutoff = max(1, int(rtm_fraction * len(values)))
    # numpy.sort returns a sorted copy, so `values` keeps the file order
    # needed for the output columns.
    baseline = numpy.mean(numpy.sort(values)[:cutoff])
    # NOTE: a baseline of exactly 0 still divides by zero here (numpy emits
    # inf/nan); no replacement value is invented for that case.
    return [str((value - baseline) / baseline) for value in values]


def run():
    """Scale the numeric columns of a tab-delimited file, row by row.

    Command line (read from sys.argv):
        inputfilename start_field outfilename [-relative_to_mean percentile]

    Every field from index start_field onwards is parsed as a float.  By
    default each value is divided by the maximum of its row; with
    -relative_to_mean the mean of the lowest `percentile` fraction of the
    row is subtracted from each value and the result divided by that mean.

    Each output row consists of the fields before start_field, one extra
    column (the literal 'Dataset' on lines starting with '#' or '@', the
    first field of the row otherwise) and then the scaled values.  Lines
    with fewer than start_field + 2 fields are reported on stdout and
    skipped.

    Exits with status 1 when required arguments are missing.  Raises
    ValueError if start_field, percentile or a data field is not numeric.
    """
    # Three positional arguments are read below, so anything shorter than
    # four argv entries cannot be processed.
    if len(sys.argv) < 4:
        print('usage: python %s inputfilename start_field outfilename [-relative_to_mean percentile]' % sys.argv[0])
        print('       start_field - the field where data start; the script will take all numerical values after that in the row and scale them from 0 to 1')
        print('       -relative_to_mean option: the mean of the bottom percentile (enter it as a fraction, i.e. 0.90) is calculated, substracted from all values (including the top ones) and all scores divided by the mean')
        sys.exit(1)

    inputfilename = sys.argv[1]
    start = int(sys.argv[2])
    outfilename = sys.argv[3]

    rtm_fraction = None
    if '-relative_to_mean' in sys.argv:
        flag_pos = sys.argv.index('-relative_to_mean')
        if flag_pos + 1 >= len(sys.argv):
            print('-relative_to_mean requires a percentile argument')
            sys.exit(1)
        rtm_fraction = float(sys.argv[flag_pos + 1])

    # Context managers close both files even if a row fails to parse.
    with open(outfilename, 'w') as outfile:
        with open(inputfilename) as infile:
            for line in infile:
                fields = line.strip().split('\t')
                if len(fields) < start + 2:
                    print('skipping line: %s' % line.strip())
                    continue
                if line.startswith(('#', '@')):
                    # Header line: copy it through, labelling the new column.
                    parts = fields[:start] + ['Dataset'] + fields[start:]
                else:
                    raw_values = [float(field) for field in fields[start:]]
                    parts = (fields[:start] + [fields[0]]
                             + _scaled_strings(raw_values, rtm_fraction))
                outfile.write('\t'.join(parts).strip() + '\n')

# Run the conversion only when this file is executed as a script; an
# unconditional call would parse the importer's argv (and possibly exit)
# whenever the module is merely imported.
if __name__ == '__main__':
    run()
