##################################
#                                #
# Last modified 08/31/2009       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
from sets import Set

def run():

    if len(sys.argv) < 3:
        print 'usage: python %s GENCODE-gtf radius outfilename' % sys.argv[0]
        sys.exit(1)

    inputfilename = sys.argv[1]
    radius = int(sys.argv[2])
    outputfilename = sys.argv[3]

    doUnique=False
    if '-unique' in sys.argv:
        doUnique=True

    outfile = open(outputfilename, 'w')

    listoflines = open(inputfilename)
    TSSDict={}
    i=0
    for line in listoflines:
        if line.startswith('#'):
            continue
        i+=1
        if i % 100000 == 0:
            print i, 'lines processed'
        fields=line.split('\t')
        if fields[2] != 'gene' and fields[2] != 'transcript':
            continue
        genename=fields[8].split('gene_name "')[1].split('";')[0]
        chr=fields[0]
        strand=fields[6]
        left=fields[3]
        right=fields[4]
        if TSSDict.has_key(genename):
            pass
        else:
            TSSDict[genename]=[]
        TSSDict[genename].append((chr,left,right,strand))

    print 'finished unputting annotation'            

    genes=TSSDict.keys()
    genes.sort()
    for genename in genes:
        UniqueList=[]
        for (chr,left,right,strand) in TSSDict[genename]:
            if strand == '+':
                UniqueList.append((chr,int(left),strand))
            if strand == '-':
                UniqueList.append((chr,int(right),strand))
        UniqueList=list(Set(UniqueList))
        for (chr,TSS,strand) in UniqueList:
            outline=genename+'\t'+chr+'\t'+str(TSS-radius)+'\t'+str(TSS+radius)+'\t'+strand+'\n'
            outfile.write(outline)

    outfile.close()

run()

