##################################
#                                #
# Last modified 12/16/2009       # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
from sets import Set
from cistematic.core import Genome
from cistematic.core.geneinfo import geneinfoDB

# psyco is an optional accelerator: enable it when it can be imported and
# started, otherwise carry on without it.
try:
    import psyco
    psyco.full()
except Exception:
    # Best-effort only. Catching Exception (rather than a bare ``except``)
    # still ignores a missing or unusable psyco, but no longer swallows
    # KeyboardInterrupt or SystemExit.
    pass

def _read_tss_by_chrom(lines, strand_col, plus_col, minus_col):
    """Group TSS records by chromosome.

    Every non-blank line is split on tabs.  Column 0 is used as the record
    ID and column 1 as the chromosome name.  The strand is read from
    ``strand_col``; the TSS coordinate is read from ``plus_col`` for '+'
    records and from ``minus_col`` for '-' records.  Lines whose strand is
    neither '+' nor '-' (for example a header row) contribute nothing.

    Returns a dict mapping chromosome -> list of (ID, strand, TSS) tuples.
    Raises IndexError / ValueError on records that are too short or whose
    coordinate is not an integer.
    """
    tss_by_chrom = {}
    for line in lines:
        if not line.strip():
            continue
        # Drop the line terminator so the strand comparison still works when
        # the strand happens to be the last column of the record.
        fields = line.rstrip('\r\n').split('\t')
        strand = fields[strand_col]
        if strand == '+':
            position = int(fields[plus_col])
        elif strand == '-':
            position = int(fields[minus_col])
        else:
            continue
        tss_by_chrom.setdefault(fields[1], []).append(
            (fields[0], strand, position))
    return tss_by_chrom


def run():
    """Write a window of +/- radius around each TSS of an annotation file.

    Command line (read from ``sys.argv``):
        knownGenesfilename radius outfilename [-GENCODE file] [-unique]

    Default input columns: ID, chromosome, strand, start, end.
    With ``-GENCODE file`` the given file is read instead of the first
    positional argument, with columns: ID, chromosome, start, end, strand.
    The TSS is the start column on the '+' strand and the end column on
    the '-' strand.

    Output is tab-separated, ordered by chromosome name and then by record:
        ID, chromosome, TSS-radius, TSS+radius, strand
    With ``-unique`` the ID column is omitted and identical
    (strand, TSS) pairs on a chromosome are written once.
    Exact duplicate records are always collapsed.  Window starts are not
    clamped, so they are negative whenever TSS < radius.

    Exits with status 1 after printing the usage text when a required
    argument is missing.
    """
    usage = 'usage: python %s knownGenesfilename radius outfilename [-GENCODE gencode.genelist] [-unique]' % sys.argv[0]

    # Three positional arguments are mandatory, i.e. four argv entries.
    if len(sys.argv) < 4:
        print(usage)
        sys.exit(1)

    inputfilename = sys.argv[1]
    radius = int(sys.argv[2])
    outputfilename = sys.argv[3]

    doUnique = '-unique' in sys.argv
    doGENCODE = '-GENCODE' in sys.argv

    if doGENCODE:
        gencode_pos = sys.argv.index('-GENCODE') + 1
        if gencode_pos >= len(sys.argv):
            # -GENCODE was given without the file name that must follow it.
            print(usage)
            sys.exit(1)
        print('will use GENCODE annotation')
        inputfilename = sys.argv[gencode_pos]
        strand_col, plus_col, minus_col = 4, 2, 3
    else:
        strand_col, plus_col, minus_col = 2, 3, 4

    # Read the input completely before touching the output file, so that a
    # bad input path does not leave a truncated output file behind.
    with open(inputfilename) as infile:
        tss_by_chrom = _read_tss_by_chrom(
            infile, strand_col, plus_col, minus_col)

    with open(outputfilename, 'w') as outfile:
        for chrom in sorted(tss_by_chrom):
            records = tss_by_chrom[chrom]
            if doUnique:
                sites = sorted(set(
                    [(strand, tss) for (_, strand, tss) in records]))
                for (strand, tss) in sites:
                    columns = [chrom, str(tss - radius), str(tss + radius),
                               strand]
                    outfile.write('\t'.join(columns) + '\n')
            else:
                for (name, strand, tss) in sorted(set(records)):
                    columns = [name, chrom, str(tss - radius),
                               str(tss + radius), strand]
                    outfile.write('\t'.join(columns) + '\n')

# Script entry point: run() is invoked unconditionally when this file is
# executed or imported.
# NOTE(review): there is no `if __name__ == '__main__'` guard, so importing
# this module also runs the conversion -- confirm that this is intended.
run()

