##################################
#                                #
# Last modified 2018/11/26       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
import os
import math
from sets import Set

def run():

    if len(sys.argv) < 9:
        print 'usage: python %s TSS.bed chrFieldID posFieldID strandFieldID nucleosomes chrFieldID posFieldID maxRadius outfile' % sys.argv[0]
        sys.exit(1)

    TSS = sys.argv[1]
    TSSchrFieldID = int(sys.argv[2])
    TSSposFieldID = int(sys.argv[3])
    TSSstrandFieldID = int(sys.argv[4])
    nucleosomes = sys.argv[5]
    NucchrFieldID = int(sys.argv[6])
    NucposFieldID = int(sys.argv[7])
    MaxR = int(sys.argv[8])
    outfilename = sys.argv[9]

    NucDict = {}

    if nucleosomes.endswith('.bz2'):
        cmd = 'bzip2 -cd ' + nucleosomes
    elif nucleosomes.endswith('.gz') or nucleosomes.endswith('.bgz'):
        cmd = 'zcat ' + nucleosomes
    elif nucleosomes.endswith('.zip'):
        cmd = 'unzip -p ' + nucleosomes
    else:
        cmd = 'cat ' + nucleosomes
    RN = 0
    P = os.popen(cmd, "r")
    peakline = 'line'
    while peakline != '':
        peakline = P.readline().strip()
        if peakline == '':
            break
        if peakline.startswith('#'):
            continue
        fields = peakline.strip().split('\t')
        chr = fields[NucchrFieldID]
        pos = int(fields[NucposFieldID])
        if NucDict.has_key(chr):
            pass
        else:
            NucDict[chr] = {}
        NucDict[chr][pos] = 1

    outfile = open(outfilename,'w')

    if TSS.endswith('.bz2'):
        cmd = 'bzip2 -cd ' + TSS
    elif TSS.endswith('.gz') or TSS.endswith('.bgz'):
        cmd = 'zcat ' + TSS
    elif TSS.endswith('.zip'):
        cmd = 'unzip -p ' + TSS
    else:
        cmd = 'cat ' + TSS
    RN = 0
    P = os.popen(cmd, "r")
    peakline = 'line'
    while peakline != '':
        peakline = P.readline().strip()
        if peakline == '':
            break
        if peakline.startswith('#'):
            outline = peakline.strip() + '\tUpstream_Nucleosome_Position\tDownstream_Nucleosome_Position'
            outfile.write(outline + '\n')
            continue
        fields = peakline.strip().split('\t')
        RN += 1
        if RN % 1000 == 0:
            print RN
        chr = fields[TSSchrFieldID]
        pos = int(fields[TSSposFieldID])
        strand = fields[TSSstrandFieldID]
        if NucDict.has_key(chr):
            pass
        else:
            continue
        NearestUpstream = 1000000000000000000000
        NearestDownstream = 1000000000000000000000
        if strand == '+':
            i = pos
            while NearestUpstream == 1000000000000000000000 and math.fabs(pos-i) < MaxR:
                i = i-1
                if NucDict[chr].has_key(i):
                     NearestUpstream = int(math.fabs(i))
            i = pos
            while NearestDownstream == 1000000000000000000000 and math.fabs(pos-i) < MaxR:
                i = i+1
                if NucDict[chr].has_key(i):
                     NearestDownstream = int(math.fabs(i))
        if strand == '-':
            i = pos
            while NearestDownstream == 1000000000000000000000 and math.fabs(pos-i) < MaxR:
                i = i-1
                if NucDict[chr].has_key(i):
                     NearestDownstream = int(math.fabs(i))
            i = pos
            while NearestUpstream == 1000000000000000000000 and math.fabs(pos-i) < MaxR:
                i = i+1
                if NucDict[chr].has_key(i):
                     NearestUpstream = int(math.fabs(i))
        if NearestUpstream == 1000000000000000000000:
            continue
        if NearestDownstream == 1000000000000000000000:
            continue
        outline = peakline.strip() + '\t' + str(NearestUpstream) + '\t' + str(NearestDownstream)
        outfile.write(outline + '\n')

    outfile.close()

run()

