##################################
#                                #
# Last modified 12/17/2015       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
from sets import Set
import numpy

def run():

    if len(sys.argv) < 4:
        print 'usage: python %s regions chrFieldID fasta outpurfilename' % sys.argv[0]
        sys.exit(1)

    regions = sys.argv[1]
    chrFieldID = int(sys.argv[2])
    fasta = sys.argv[3]
    outfilename = sys.argv[4]

    GenomeDict={}
    sequence=''
    inputdatafile = open(fasta)
    for line in inputdatafile:
        if line[0]=='>':
            if sequence != '':
                GenomeDict[chr] = ''.join(sequence).upper()
            chr = line.strip().split('>')[1]
            sequence=[]
            Keep=False
            continue
        else:
            sequence.append(line.strip())
    GenomeDict[chr] = ''.join(sequence).upper()

    outfile = open(outfilename, 'w')

    Ls = []
    LnoNs = []

    inputdatafile = open(regions)
    for line in inputdatafile:
        if line.startswith('#'):
            outfile.write(line)
            continue
        fields = line.strip().split('\t')
        chr = fields[chrFieldID]
        left = int(fields[chrFieldID + 1])
        right = int(fields[chrFieldID + 2])
        L = right - left
        Ls.append(L)
        outline = line.strip() + '\t' + str(L)
        for i in range(left,right):
            if GenomeDict[chr][i] == 'N':
                L-=1
        LnoNs.append(L)
        outline = outline + '\t' + str(L)
        outfile.write(outline + '\n')

    outline = '#Average_with_Ns:\t' + str(numpy.mean(numpy.array(Ls)))
    outfile.write(outline + '\n')
    outline = '#Average_without_Ns:\t' + str(numpy.mean(numpy.array(LnoNs)))
    outfile.write(outline + '\n')
    outline = '#Median_with_Ns:\t' + str(numpy.median(numpy.array(Ls)))
    outfile.write(outline + '\n')
    outline = '#Median_without_Ns:\t' + str(numpy.median(numpy.array(LnoNs)))
    outfile.write(outline + '\n')

    outfile.close()
   
run()
