##################################
#                                #
# Last modified 2017/07/18       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
import math
import os
from sets import Set

def run():

    if len(sys.argv) < 5:
        print 'usage: python %s file1 chromField1 file2 chromField2 outfilename [-ignoreFile2Misses]' % sys.argv[0]
        print '\tNote: To decrease memory usage, list the file with smaller total genomic footprint first'
        print "\tNote: you can use gzipped and bzipped files, the script will detect those automaticlaly if they end in .gz or .bz2"
        sys.exit(1)

    file1 = sys.argv[1]
    file2 = sys.argv[3]
    chromField1 = int(sys.argv[2])
    chromField2 = int(sys.argv[4])
    outfilename = sys.argv[5]
    minOverlap=0
    if '-minOverlap' in sys.argv:
        minOverlap=float(sys.argv[sys.argv.index('-minOverlap')+1])

    CoverageDict1={}
    RegionDict1={}

    doIgnoreFile2Misses = False
    if '-ignoreFile2Misses' in sys.argv:
        doIgnoreFile2Misses = True

    i=0
    if file1.endswith('.bz2'):
        cmd = 'bzip2 -cd ' + file1
    elif file1.endswith('.gz'):
        cmd = 'gunzip -c ' + file1
    else:
        cmd = 'cat ' + file1
    p1 = os.popen(cmd, "r")
    line = '.'
    while line != '':
        line = p1.readline()
        if line.startswith('#') or line.startswith('track ') or line.strip() == '':
            continue
        fields=line.strip().split('\t')
        lenFields1=len(fields)
        chr=fields[chromField1]
        start=int(float(fields[chromField1+1]))
        stop=int(float(fields[chromField1+2]))
        if CoverageDict1.has_key(chr):
            pass
        else:
            CoverageDict1[chr]={}
        for j in range(start,stop):
            if CoverageDict1[chr].has_key(j):
                pass
            else:
                CoverageDict1[chr][j] = []
            CoverageDict1[chr][j].append(i)
        RegionDict1[i]=fields
        i+=1

    outfile = open(outfilename, 'w')

    CoverageDict2={}

    if file2.endswith('.bz2'):
        cmd = 'bzip2 -cd ' + file2
    elif file2.endswith('.gz'):
        cmd = 'gunzip -c ' + file2
    else:
        cmd = 'cat ' + file2
    p2 = os.popen(cmd, "r")
    line = '.'
    while line != '':
        line = p2.readline()
        if line.startswith('#') or line.startswith('track ') or line.strip() == '':
            continue
        fields=line.strip().split('\t')
        lenFields2=len(fields)
        chr=fields[chromField2]
        start=int(float(fields[chromField2+1]))
        stop=int(float(fields[chromField2+2]))
        if CoverageDict2.has_key(chr):
            pass
        else:
            CoverageDict2[chr]={}
        for j in range(start,stop):
            CoverageDict2[chr][j]=i
        OverlappRegions = {}
        for j in range(start,stop):
            if CoverageDict1.has_key(chr) and CoverageDict1[chr].has_key(j):
                for r in CoverageDict1[chr][j]:
                    OverlappRegions[r] = 1
        if len(OverlappRegions.keys()) > 0:
            for region1 in OverlappRegions.keys():
                outline=''
                fields1 = RegionDict1[region1]
                for f in range(lenFields1):
                    outline=outline+fields1[f]+'\t'
                for f in range(lenFields2):
                    outline=outline+fields[f]+'\t'
                outfile.write(outline.strip()+'\n')
        else:
            if doIgnoreFile2Misses:
                pass
            else:
                outline=''
                for f in range(lenFields1):
                    outline=outline+'-\t'
                for f in range(lenFields2):
                    outline=outline+fields[f]+'\t'
                outfile.write(outline.strip()+'\n')
        i+=1

    if file1.endswith('.bz2'):
        cmd = 'bzip2 -cd ' + file1
    elif file1.endswith('.gz'):
        cmd = 'gunzip -c ' + file1
    else:
        cmd = 'cat ' + file1
    p1 = os.popen(cmd, "r")
    line = '.'
    while line != '':
        line = p1.readline()
        if line.startswith('#') or line.startswith('track ') or line.strip() == '':
            continue
        fields = line.strip().split('\t')
        chr = fields[chromField1]
        start = int(fields[chromField1+1])
        stop = int(fields[chromField1+2])
        Overlap = False
        for j in range(start,stop):
            if CoverageDict2.has_key(chr) and CoverageDict2[chr].has_key(j):
                Overlap=True
                break
        outline=''
        if Overlap:
            continue
        else:
            for f in range(lenFields1):
                outline=outline+fields[f]+'\t'
            for f in range(lenFields2):
                outline=outline+'-\t'
            outfile.write(outline.strip()+'\n')

    outfile.close()


run()
