##################################
#                                #
# Last modified 2019/03/17       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
import math
import os
import pyBigWig
from sets import Set

def run():

    if len(sys.argv) < 9:
        print 'usage: python %s config exons.bed chrFieldID leftFieldID rightFieldID geneIDFieldID minCFD cutting/editing_radius outputfilename [-offset bp]' % sys.argv[0]
        print '\tconfig files format:' 
        print '\tgeneID\tguides.csv' 
        sys.exit(1)
    
    config = sys.argv[1]
    exons = sys.argv[2]
    chrFieldID = int(sys.argv[3])
    leftFieldID = int(sys.argv[4])
    rightFieldID = int(sys.argv[5])
    GeneFieldID = int(sys.argv[6])
    minCFD = float(sys.argv[7])
    radius = int(sys.argv[8])
    outfilename = sys.argv[9]

    OS = 0
    doOffset = False
    if '-offset' in sys.argv:
        doOffset = True
        OS = int(sys.argv[sys.argv.index('-offset') + 1])
        print 'will shift cut sites regions by', OS, 'bp'

    ExonsDict = {}
    if exons.endswith('.bz2'):
        cmd = 'bzip2 -cd ' + exons
    elif exons.endswith('.gz'):
        cmd = 'zcat ' + exons
    else:
        cmd = 'cat ' + exons
    p = os.popen(cmd, "r")
    LC = 0
    line = 'line'
    while line != '':
        line = p.readline()
        if line == '':
            break
        if line.startswith('#'):
            continue
        fields = line.strip().split('\t')
        chr = fields[chrFieldID]
        left = int(fields[leftFieldID])
        right = int(fields[rightFieldID])
        geneID = fields[GeneFieldID]
        if ExonsDict.has_key(geneID):
            pass
        else:
            ExonsDict[geneID] = {}
            ExonsDict[geneID]['exons'] = []
            ExonsDict[geneID]['guides'] = {}
        ExonsDict[geneID]['exons'].append((chr,left,right))

    outfile = open(outfilename,'w')
    outline = '#gene\texons\ttotal_length\ttotal_guides_passing_CFD\tfraction_of_bases_covered'
    outfile.write(outline+'\n')

    lineslist = open(config)
    for line in lineslist:
        fields = line.strip().split('\t')
        geneID = fields[0]
        guides = fields[1]
        if ExonsDict.has_key(geneID):
            pass
        else:
            print 'skipping gene', geneID, 'not found in exons file'
            continue
        if guides.endswith('.bz2'):
            cmd = 'bzip2 -cd ' + guides
        elif guides.endswith('.gz'):
            cmd = 'zcat ' + guides
        else:
            cmd = 'cat ' + guides
        p = os.popen(cmd, "r")
        LC = 0
        line = 'line'
        while line != '':
            line = p.readline()
            if line == '':
                break
            if line.startswith('#'):
                continue
            fields = line.strip().split(',')
            chr = fields[0].split(':')[0]
            left = int(fields[0].split(':')[-2].split('-')[0])
            right = int(fields[0].split(':')[-2].split('-')[1])
            strand = fields[0][-1]
            if strand == '+':
                cut = right - 3 - 3 + OS
            if strand == '-':
                cut = left + 3 + 3 - OS
            CFD = float(fields[2])
            if CFD < minCFD:
                continue
            ExonsDict[geneID]['guides'][(chr,cut,strand)] = (CFD)
        outline = geneID + '\t' + str(len(Set(ExonsDict[geneID]['exons'])))
        bases = []
        for (chr,left,right) in ExonsDict[geneID]['exons']:
            for i in range(left,right):
                bases.append((chr,i))
        bases = Set(bases)
        editable_positions = []
        for (chr,cut,strand) in ExonsDict[geneID]['guides'].keys():
            for i in range(cut-radius,cut+radius):
                editable_positions.append((chr,i))
        editable_positions = Set(editable_positions)
        wanted_editable_positions = bases.intersection(editable_positions)
        bases = len(bases)
        wanted_editable_positions = len(wanted_editable_positions)
        outline = outline + '\t' + str(bases)
        outline = outline + '\t' + str(len(ExonsDict[geneID]['guides'].keys())) + '\t' + str(wanted_editable_positions/(bases + 0.0))
        outfile.write(outline + '\n')
        print outline

    outfile.close()
   
run()
