##################################
#                                #
# Last modified 9/3/2019         # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
from cistematic.core.geneinfo import geneinfoDB
from cistematic.genomes import Genome

def run():

    if len(sys.argv) < 7:
        print 'usage: python %s genome regionfilename chrfieldID peakfieldID radius running_average_window outfilename [-rankby fieldID (numbers only)]' % sys.argv[0]
        sys.exit(1)

    cachePages = 2000000
    
    genome = sys.argv[1]
    inputfilename = sys.argv[2]
    chrfieldID = int(sys.argv[3])
    peakID = int(sys.argv[4])
    radius = int(sys.argv[5])
    running_average_window = int(sys.argv[6])
    outfilename = sys.argv[7]

    outfile = open(outfilename, 'w')
    hg = Genome(genome)
    
    doSort=False
    if '-rankby' in sys.argv:
        doSort=True
        sortID=int(sys.argv[sys.argv.index('-rankby')+1])

    inputdatalist = open(inputfilename)
    n=0
    i=0

    half_average=int(running_average_window/2.0)

    sequenceDict={}
    for line in inputdatalist:
        i+=1
        if i % 100 == 0:
            print i,
        if line.startswith("#"):
            continue
        fields = line.split('\n')[0].split('\t')
        sense = 'F'
        chromosome = fields[chrfieldID]
        peakPos=int(fields[peakID])
        start = peakPos-radius-half_average
        stop = peakPos+radius+half_average
        ID=chromosome+':'+str(peakPos)
        if doSort:
            ID=(float(fields[sortID]),ID)
        try:
            chr = chromosome[3:len(chromosome)]
            sequence = string.upper(hg.sequence(chr,start,stop-start))
            sequenceDict[ID]=sequence
        except:
            print 'could not find', chromosome, start, stop
            n+=1

    print 'could not parse sequence for', n, 'out of', i, 'regions'


    outline='#ID'
    for i in range(0-radius,0+radius):
        outline=outline+'\t'+str(i)
    outfile.write(outline+'\n')

    keys=sequenceDict.keys()
    keys.sort()
    keys.reverse()
    for ID in keys:
        if doSort:
            outline=ID[1]
        else:
            outline=ID
        sequence=sequenceDict[ID]
        scoreDict={}
        for i in range(half_average,2*radius+half_average):
            GC=0.0
            for j in range(i-half_average,i+half_average):
                if sequence[j]=='G' or sequence[j]=='C':
                    GC+=1
            scoreDict[i]=GC/running_average_window
        scores=scoreDict.keys()
        scores.sort()
        for score in scores:
            outline=outline+'\t'+str(scoreDict[score]-0.5)
        outfile.write(outline+'\n')

run()
