##################################
#                                #
# Last modified 06/02/2014       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import math
import random
import string
from sets import Set

def run():

    if len(sys.argv) < 6:
        print 'usage: python %s gtf RPM_wiggle 3-end-radius neighborhood_radius minFPKM outfile [-bioTypes biotype1,biotype2,...]' % sys.argv[0]
        print '\tThe script will take an annotation and a coverage track (the track has to be in RPM units!!!), and will' 
        print '\tlook for genes around the 3-end of which there is coverage at levels highers than the indicated minFPKM'
        print '\tthreshold over a region of size equal to the 3-radius that are inconsisten with the annotation. Where '
        print '\tthe script searches for such regions is control by the region around the 3-end and the larger neighbourhood'
        print '\tradius parameter around it'
        print '\tNOT FINISHED'
        sys.exit(1)

    gtf = sys.argv[1]
    wig = sys.argv[2]
    radius3 = int(sys.argv[3])
    radiusN = int(sys.argv[4])
    minRPKM = float(sys.argv[5])
    outputfilename = sys.argv[6]

    doBT = False
    if '-bioTypes' in sys.argv:
        doBT = True
        WantedBTs = {}
        for BT in sys.argv[sys.argv.index('-bioTypes') + 1].split(','):
            WantedBTs[BT] = 1

    outfile = open(outputfilename, 'w')

    lineslist = open(gtf)
    GeneDict={}
    for line in lineslist:
        if line.startswith('#'):
            continue
        fields=line.strip().split('\t')
        if doBT:
            BT = fields[8].split('gene_type "')[1].split('";')[0]
            if WantedBTs.has_key(BT):
                pass
            else:
                continue
        if fields[2] != 'exon':
            continue
        if 'gene_name "' in fields[8]:
            GeneName=fields[8].split('gene_name "')[1].split('";')[0]
        else:
            GeneName=fields[8].split('gene_id "')[1].split('";')[0]
        GeneID=fields[8].split('gene_id "')[1].split('";')[0]
        if 'transcript_name "' in fields[8]:
            TranscriptName=fields[8].split('transcript_name "')[1].split('";')[0]
        else:
            TranscriptName=fields[8].split('transcript_id "')[1].split('";')[0]
        TranscriptID=fields[8].split('transcript_id "')[1].split('";')[0]
        if GeneDict.has_key((GeneID,GeneName)):
            pass
        else:
            GeneDict[(GeneID,GeneName)]={}
        if GeneDict[(GeneID,GeneName)].has_key((TranscriptID,TranscriptName)):
            pass
        else:
            GeneDict[(GeneID,GeneName)][(TranscriptID,TranscriptName)]=[]
        chr=fields[0]
        left=int(fields[3])
        right=int(fields[4])
        orientation=fields[6]
        GeneDict[(GeneID,GeneName)][(TranscriptID,TranscriptName)].append((chr,left,right,orientation))

    CoverageDict = {}
    for (GeneID,GeneName) in GeneDict.keys():
        for (TranscriptID,TranscriptName) in GeneDict[(GeneID,GeneName)]:
            GeneDict[(GeneID,GeneName)][(TranscriptID,TranscriptName)].sort()
            strand = GeneDict[(GeneID,GeneName)][(TranscriptID,TranscriptName)][0][3]
            chr = GeneDict[(GeneID,GeneName)][(TranscriptID,TranscriptName)][0][0]
            if strand == '+':
                end3 = GeneDict[(GeneID,GeneName)][(TranscriptID,TranscriptName)][-1][2]
            if strand == '-':
                end3 = GeneDict[(GeneID,GeneName)][(TranscriptID,TranscriptName)][0][1]
            if CoverageDict.has_key(chr):
                pass
            else:
                CoverageDict[chr] = {}
            for i in range(end3 - radiusN,end3 + radiusN,):
                CoverageDict[chr][i] = 0

    lineslist = open(wig)
    for line in lineslist:
        fields = line.strip()
        chr = fields[0]
        left = int(fields[1])
        right = int(fields[2])
        score = float(fields[3])
        if CoverageDict.has_key(chr):
            pass
        else:
            continue
        for i in range(left,right):
            if CoverageDict[chr].has_key(i):
                CoverageDict[chr][i] = score

    outfile = open(outputfilename,'w')


    outfile.close()

run()

