##################################
#                                #
# Last modified 2025/02/13       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import os
import math
import string

def getReverseComplement(preliminarysequence):
    
    DNA = {'A':'T','T':'A','G':'C','C':'G','N':'N','a':'t','t':'a','g':'c','c':'g','n':'n'}
    sequence=''
    for j in range(len(preliminarysequence)):
        sequence=sequence+DNA[preliminarysequence[len(preliminarysequence)-j-1]]
    return sequence

def run():

    if len(sys.argv) < 3:
        print 'usage: python %s cdt outprefix extension_radius [-max] [-min]' % sys.argv[0]
        print '\tthe cdt file can be in .gz or .bz2 format, but it has to end with these suffixes' 
        sys.exit(1)

    CDT = sys.argv[1]
    outprefix = sys.argv[2]
    extR = int(sys.argv[3])

    LabelDict = {}
    ChrDict = {}

    doMax = False
    if '-max' in sys.argv:
        doMax = True
        print 'will output only the max effect (by absolute value)'

    doMin = False
    if '-min' in sys.argv:
        doMin = True
        print 'will output only the minimum effect value'

    i=0
    if CDT.endswith('.bz2'):
        cmd = 'bzip2 -cd ' + CDT
    elif CDT.endswith('.gz'):
        cmd = 'zcat ' + CDT
    else:
        cmd = 'cat ' + CDT
    p = os.popen(cmd, "r")
    line = 'line'
    CL = 0
    while line != '':
        line = p.readline()
        CL += 1
        if CL % 10000 == 0:
            print CL
        fields = line.strip().split('\t')
        if CL == 1:
            for i in range(3,len(fields)):
                LabelDict[i] = {}
                LabelDict[i]['label'] = fields[i].replace('#','').replace(' ','_').replace(',','')
                LabelDict[i]['data'] = {}
            continue
        if CL == 2:
            continue
        if CL == 3:
            continue
        if CL == 4:
            continue
        try:
            chr = fields[1].replace('#','').split('_')[0]
        except:
            print 'skipping', fields
            continue
        pos = int(fields[1].split('_')[1])
        for i in range(4,len(fields)):
            if LabelDict[i]['data'].has_key(chr):
                pass
            else:
                LabelDict[i]['data'][chr] = {}
            LabelDict[i]['data'][chr][pos] = fields[i]

    if doMax:
        outfilename = outprefix + '.' + 'max'
        outfile = open(outfilename, 'w')
        maxDict = {}
        for i in LabelDict.keys():
            label = LabelDict[i]['label']
            print i, label
            chromosomes = LabelDict[i]['data'].keys()
            chromosomes.sort()
            for chr in chromosomes:
                if maxDict.has_key(chr):
                    pass
                else:
                    maxDict[chr]={}
                positions = LabelDict[i]['data'][chr].keys()
                positions.sort()
                for pos in positions:
                    if maxDict[chr].has_key(pos):
                        pass
                    else:
                        maxDict[chr][pos]=[]
                    maxDict[chr][pos].append(float(LabelDict[i]['data'][chr][pos]))
        for chr in chromosomes:
            positions = LabelDict[i]['data'][chr].keys()
            positions.sort()
            for pos in positions:
                minFC = min(maxDict[chr][pos])
                maxFC = max(maxDict[chr][pos])
                if math.fabs(minFC) > math.fabs(maxFC):
                    maxFC = minFC
                outline = chr + '\t' + str(pos - extR)  + '\t' + str(pos + extR) + '\t' + str(maxFC)
                outfile.write(outline + '\n')
        outfile.close()
    elif doMin:
        outfilename = outprefix + '.' + 'min'
        outfile = open(outfilename, 'w')
        maxDict = {}
        for i in LabelDict.keys():
            label = LabelDict[i]['label']
            print i, label
            chromosomes = LabelDict[i]['data'].keys()
            chromosomes.sort()
            for chr in chromosomes:
                if maxDict.has_key(chr):
                    pass
                else:
                    maxDict[chr]={}
                positions = LabelDict[i]['data'][chr].keys()
                positions.sort()
                for pos in positions:
                    if maxDict[chr].has_key(pos):
                        pass
                    else:
                        maxDict[chr][pos]=[]
                    maxDict[chr][pos].append(float(LabelDict[i]['data'][chr][pos]))
        for chr in chromosomes:
            positions = LabelDict[i]['data'][chr].keys()
            positions.sort()
            for pos in positions:
                minFC = min(maxDict[chr][pos])
                outline = chr + '\t' + str(pos - extR)  + '\t' + str(pos + extR) + '\t' + str(minFC)
                outfile.write(outline + '\n')
        outfile.close()
    else:
        for i in LabelDict.keys():
            label = LabelDict[i]['label']
            print i, label
            outfilename = outprefix + '.' + str(i)
            outfile = open(outfilename, 'w')
            chromosomes = LabelDict[i]['data'].keys()
            chromosomes.sort()
            for chr in chromosomes:
                positions = LabelDict[i]['data'][chr].keys()
                positions.sort()
                for pos in positions:
                    outline = chr + '\t' + str(pos - extR)  + '\t' + str(pos + extR) + '\t' + str(LabelDict[i]['data'][chr][pos])
                    outfile.write(outline + '\n')
            outfile.close()

run()
