##################################
#                                #
# Last modified 09/02/2009       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
from sets import Set

def run():

    if len(sys.argv) < 3:
        print 'usage: python %s GENCODE-gtf radius-to-closest-genes outfileprefix [-singlemodelgenes] [-biotype biotype]' % sys.argv[0]
        sys.exit(1)

    inputfilename = sys.argv[1]
    radius = int(sys.argv[2])
    outputfilename = sys.argv[3]

    doSingle=False
    if '-singlemodelgenes' in sys.argv:
        doSingle=True

    doBioType=False
    if '-biotype' in sys.argv:
        doBioType=True
        biotype=sys.argv[sys.argv.index('-biotype')+1]
        print 'will only consider', biotype, 'genes'

    outfile = open(outputfilename, 'w')

    listoflines = open(inputfilename)

    Dict3={}
    Dict5={}

    GeneDict={}
    ExonDict={}

    i=0
    for line in listoflines:
        if line.startswith('#'):
            continue
        i+=1
        if i % 100000 == 0:
            print i, 'lines processed'
        fields=line.split('\t')
        name=fields[8].split('gene_name "')[1].split('";')[0]
        chr=fields[0]
        strand=fields[6]
        left=int(fields[3])
        right=int(fields[4])
        type=fields[2]
        BT = fields[8].split('gene_type "')[1].split('";')[0]
        if fields[2] == 'transcript':
            if GeneDict.has_key(name):
                pass
            else:
                GeneDict[name]=[]
            GeneDict[name].append((chr,left,right,strand,BT))
        if fields[2] == 'exon':
            if ExonDict.has_key(chr):
                pass
            else:
                ExonDict[chr]=[]
            ExonDict[chr].append((chr,left,right,strand,BT))

    for name in GeneDict.keys():
        if len(GeneDict[name])>1 and doSingle:
            continue
        for (chr,left,right,strand,BT) in GeneDict[name]:
            if Dict3.has_key(chr) and Dict5.has_key(chr):
                pass
            else:
                Dict3[chr]=[]
                Dict5[chr]=[]
            if strand=='+':
                Dict3[chr].append((name,chr,right,strand,BT))
                Dict5[chr].append((name,chr,left,strand,BT))
            if strand=='-':
                Dict5[chr].append((name,chr,right,strand,BT))
                Dict3[chr].append((name,chr,left,strand,BT))

    for chr in ExonDict.keys():
        ExonDict[chr]=list(Set(ExonDict[chr]))
        ExonDict[chr].sort()

    for chr in Dict3.keys():
        print chr
        Dict3[chr].sort()
        for (name,chr,pos,strand,BT) in Dict3[chr]:
            isolated=True
            if strand == '+':
                for (Echr,Eleft,Eright,Estrand,BT) in ExonDict[chr]:
                    if Eright < pos:
                        continue
                    if Eright == pos:
                        continue
                    if Eleft < pos+radius and Eleft > pos:
                        isolated=False
                        break
                    if Eleft > pos+radius:
                        break
            if strand == '-':
                for (Echr,Eleft,Eright,Estrand,BT) in ExonDict[chr]:
                    if Eleft < pos-radius:
                        continue
                    if Eleft == pos:
                        continue
                    if Eright > pos-radius and Eright < pos:
                        isolated=False
                        break
                    if Eleft > pos:
                        break
            if isolated:
                if doBioType and BT != biotype:
                    continue
                outline=name+'\t'+chr+'\t'+str(pos)+'\t'+strand
                outfile.write(outline+'\n')

                    
    print 'finished unputting annotation'            

    outfile.close()

run()

