##################################
#                                #
# Last modified 2018/09/15       #
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
import math
import random
import os
from sets import Set
import time

def run():

    if len(sys.argv) < 8:
        print 'usage: python %s guides.csv CFD_fieldID minCFD regions.bed chrFieldID leftFieldID rightFieldID outfile [-bedExtend left right]' % sys.argv[0]
        print '\tAssumed format:'
        print '\t\tchr1:79033-79138__chr1:79029-79051:+,36,0.09309323,71;2:7|3:64,GGCCACCTGGCCTGGGACTC'
        print '\ti.e., the region coordinates, the same as in the bed file, are recorded as chr1:79033-79138 in the first field'
        print '\tall input files can be zipped'
        sys.exit(1)
  
    guides = sys.argv[1]
    CFDFieldID = int(sys.argv[2])
    minCFD = float(sys.argv[3])
    regions = sys.argv[4]
    chrFieldID = int(sys.argv[5])
    leftFieldID = int(sys.argv[6])
    rightFieldID = int(sys.argv[7])
    outfilename = sys.argv[8]

    doExtend = False
    if '-bedExtend' in sys.argv:
        doExtend = True
        LExt = int(sys.argv[sys.argv.index('-bedExtend') + 1])
        RExt = int(sys.argv[sys.argv.index('-bedExtend') + 2])
        print 'will assume regions have been extended by', LExt, 'and', RExt, 'bp'

    RegionDict = {}

    if regions.endswith('.bz2'):
        cmd = 'bzip2 -cd ' + regions
    elif regions.endswith('.gz'):
        cmd = 'gunzip -c ' + regions
    elif regions.endswith('.zip'):
        cmd = 'unzip -p ' + regions
    else:
        cmd = 'cat ' + regions
    p = os.popen(cmd, "r")
    line = 'line'
    while line != '':
        line = p.readline().strip()
        if line == '':
            break
        if line.startswith('#'):
            continue
        fields = line.strip().split('\t')
        chr = fields[chrFieldID]
        left = fields[leftFieldID]
        right = fields[rightFieldID]
        if doExtend:
            left = str(int(left) - LExt)
            right = str(int(right) + RExt)
        region = chr + ':' + left + '-' + right
        RegionDict[region] = []

    print 'finished inputing regions'

    if guides.endswith('.bz2'):
        cmd = 'bzip2 -cd ' + guides
    elif guides.endswith('.gz'):
        cmd = 'gunzip -c ' + guides
    elif guides.endswith('.zip'):
        cmd = 'unzip -p ' + guides
    else:
        cmd = 'cat ' + guides
    p = os.popen(cmd, "r")
    L = 0
    line = 'line'
    while line != '':
        line = p.readline().strip()
        if line == '':
            break
        if line.startswith('#'):
            continue
        L += 1
        if L % 1000000 == 0:
            print str(L/1000000) + 'M lines processed'
        fields = line.strip().split(',')
        region = fields[0].split('__')[0]
        CFD = float(fields[CFDFieldID])
        RegionDict[region].append(CFD)

    outfile = open(outfilename, 'w')

    for region in RegionDict.keys():
        CFDs = RegionDict[region]
        outline = region + '\t' + str(len(CFDs)) + '\t' + str(sum(i > minCFD for i in CFDs))
        outfile.write(outline + '\n')

    outfile.close()

run()
