##################################
#                                #
# Last modified 2017/08/07       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
import numpy
import scipy.stats

def run():
    """Print a pairwise correlation matrix for selected columns of a TSV file.

    Arguments are taken from ``sys.argv``::

        python <script> input valuefields [-spearman]

    * ``input``       - tab-delimited file. Every line starting with ``#`` is
      treated as a header line: it (re)initialises the per-column label and
      value list. All other lines must hold a float in each selected column.
    * ``valuefields`` - 0-based column indexes, comma separated; each item is
      either a single index or an inclusive ``start:end`` range.
    * ``-spearman``   - use Spearman rank correlation instead of Pearson.

    Output (stdout): the sorted list of selected column indexes, a header row
    of column labels prefixed with ``#``, then one row per selected column
    containing its label and its correlation with every selected column.

    Exits with status 1 after printing usage when ``input`` or ``valuefields``
    is missing. Raises ``KeyError`` if a data line is met before any ``#``
    header line (or no header exists), and ``ValueError`` / ``IndexError`` for
    non-numeric or missing fields.
    """
    # Both the input path and the field specification are mandatory; checking
    # for fewer than 3 argv entries prevents an IndexError on sys.argv[2].
    if len(sys.argv) < 3:
        # Single-argument print(...) behaves identically on Python 2 and 3.
        print('usage: python %s input valuefields [-spearman]' % sys.argv[0])
        print('\tvaluefields format: either comma separated, or start:end (including start and end, 0-based)')
        print('\ttarget element IDs - comma separated')
        print('\tThe presence of a header line is assumed')
        sys.exit(1)

    input_path = sys.argv[1]
    doSpearman = '-spearman' in sys.argv

    # Expand the field specification into a sorted list of column indexes.
    valueFields = []
    for spec in sys.argv[2].split(','):
        if ':' in spec:
            bounds = spec.split(':')
            # Ranges are inclusive at both ends.
            valueFields.extend(range(int(bounds[0]), int(bounds[1]) + 1))
        else:
            valueFields.append(int(spec))
    valueFields.sort()

    print(valueFields)

    DataDict = {}

    # The context manager guarantees the file is closed even if parsing fails.
    with open(input_path) as linelist:
        for line in linelist:
            # NUL bytes are dropped before splitting the line into fields.
            fields = line.replace('\x00', '').strip().split('\t')
            if line.startswith('#'):
                for ID in valueFields:
                    DataDict[ID] = {'label': fields[ID], 'values': []}
                continue
            for ID in valueFields:
                DataDict[ID]['values'].append(float(fields[ID]))

    labels = [DataDict[ID]['label'] for ID in valueFields]
    print('\t'.join(['#'] + labels))

    # Convert each column to an array once rather than once per pair.
    columns = {}
    for ID in valueFields:
        columns[ID] = numpy.asarray(DataDict[ID]['values'], dtype=float)

    for ID1 in valueFields:
        cells = [DataDict[ID1]['label']]
        for ID2 in valueFields:
            if doSpearman:
                CC = scipy.stats.spearmanr(columns[ID1], columns[ID2])[0]
            else:
                CC = numpy.corrcoef(columns[ID1], columns[ID2])[0, 1]
            cells.append(str(CC))
        print('\t'.join(cells))

# Script entry point: run() is invoked unconditionally when this file is
# executed. There is no __main__ guard, so importing the module also runs it.
run()
