##################################
#                                #
# Last modified 10/2/2009         # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
from sets import Set
from cistematic.core import Genome
from cistematic.core.geneinfo import geneinfoDB

# psyco is an optional accelerator: use it when it is installed, otherwise
# continue without it.
try:
    import psyco
    psyco.full()
except Exception:
    # Best effort only. Catching Exception (rather than a bare except) keeps
    # the script running when psyco is missing or fails to start, without
    # also swallowing KeyboardInterrupt / SystemExit.
    pass

def _read_lines(filename):
    """Return all lines of *filename*, closing the file even if reading fails."""
    handle = open(filename)
    try:
        return handle.readlines()
    finally:
        handle.close()


def run():
    """Write a +/- radius window around every TSS listed in a known-genes file.

    Arguments are read from sys.argv:
        1. knownGenes filename -- tab-separated; column 0 is the ID looked up
           in the ID-to-locus table, column 1 is used as the chromosome key,
           column 2 is the strand ('+' or '-'), and the TSS is taken from
           column 3 for '+' rows and from column 4 for '-' rows.
        2. radius -- integer, subtracted from / added to each TSS.
        3. genome -- passed to cistematic's Genome and geneinfoDB.
        4. KnownGeneToLocus filename -- tab-separated; column 0 is the ID
           used in the knownGenes file, column 1 is the gene ID.
        5. output filename.

    Each output line is: name, strand, chromosome, TSS-radius, TSS+radius
    (tab-separated).  The name is 'nonamematch' when the ID has no entry in
    the KnownGeneToLocus file or its gene ID is not among the genome's gene
    features.  Output is sorted by chromosome and duplicates are removed.

    Exits with status 1 after printing a usage message when fewer than five
    arguments are given.
    """
    # Five positional arguments are required, so sys.argv needs six entries
    # (sys.argv[5] is read below).
    if len(sys.argv) < 6:
        print('usage: python %s knownGenesfilename radius genome KnownGeneToLocusFilename outfilename' % sys.argv[0])
        sys.exit(1)

    inputfilename = sys.argv[1]
    radius = int(sys.argv[2])
    genome = sys.argv[3]
    KnownGeneToLocus = sys.argv[4]
    outputfilename = sys.argv[5]

    # Map each ID from the locus table to its gene ID, and give every gene ID
    # a placeholder name that is overwritten below when the genome knows it.
    UCSCtoIDDict = {}
    IDtoNameDict = {}
    for line in _read_lines(KnownGeneToLocus):
        if not line.strip():
            # Blank lines (e.g. a trailing newline) carry no mapping.
            continue
        fields = line.strip().split('\t')
        UCSCtoIDDict[fields[0]] = fields[1]
        IDtoNameDict[fields[1]] = 'nonamematch'

    hg = Genome(genome)
    idb = geneinfoDB()
    # NOTE(review): the result of getallGeneInfo is never used; the call is
    # kept in case it has side effects -- confirm and remove if it does not.
    geneinfoDict = idb.getallGeneInfo(genome)
    featDict = hg.getallGeneFeatures()

    geneIDList = list(featDict.keys())
    total = len(geneIDList)
    for i, k in enumerate(geneIDList):
        if i % 1000 == 0:
            # Progress report: number of genes still to be processed.
            print(total - i)
        info = idb.getGeneInfo((genome, k))
        if info == []:
            name = 'LOC' + str(k)
        else:
            name = info[0]
        IDtoNameDict[k] = name

    # Resolve every ID straight to its final name.
    UCSCtoNameDict = {}
    for UCSCID in UCSCtoIDDict:
        UCSCtoNameDict[UCSCID] = IDtoNameDict[UCSCtoIDDict[UCSCID]]

    # Collect (ID, strand, TSS) tuples per chromosome.
    TSSDict = {}
    for line in _read_lines(inputfilename):
        if not line.strip():
            continue
        fields = line.split('\t')
        entries = TSSDict.setdefault(fields[1], [])
        strand = fields[2]
        if strand == '+':
            entries.append((fields[0], '+', int(fields[3])))
        elif strand == '-':
            entries.append((fields[0], '-', int(fields[4])))

    chromList = list(TSSDict.keys())
    chromList.sort()

    outfile = open(outputfilename, 'w')
    try:
        for chrom in chromList:
            # De-duplicate, then sort so the output order is deterministic.
            entries = list(Set(TSSDict[chrom]))
            entries.sort()
            for (UCSCID, orientation, TSS) in entries:
                name = UCSCtoNameDict.get(UCSCID, 'nonamematch')
                # NOTE(review): TSS - radius is written as-is and can be
                # negative when the TSS is closer than radius to position 0.
                outline = name + '\t' + orientation + '\t' + chrom + '\t' + str(TSS - radius) + '\t' + str(TSS + radius) + '\n'
                outfile.write(outline)
    finally:
        outfile.close()

# Script entry point: run() is invoked unconditionally, so it also executes
# if this file is imported as a module.
run()

