##################################
#                                #
# Last modified 5/6/2009         # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
from sets import Set
from cistematic.core.geneinfo import geneinfoDB
from cistematic.genomes import Genome

def getSequence(genome,chromosome,start,stop,sense):
    
    hg = Genome(genome)
    DNA = {'A':'T','T':'A','G':'C','C':'G','N':'N'}
    chromosome = chromosome[3:len(chromosome)]
    if sense=='F':
        sequence = string.upper(hg.sequence(chromosome,start,stop-start))
    if sense=='R':
        preliminarysequence = string.upper(hg.sequence(chromosome,start,stop-start))
        sequence=''
        for i in range(len(preliminarysequence)):
            sequence=sequence+DNA[preliminarysequence[len(preliminarysequence)-i-1]]
    
    return sequence

def run():

    if len(sys.argv) < 2:
        print 'usage: python %s  genome outfilename' % sys.argv[0]
        sys.exit(1)

    cachePages = 2000000

    genome = sys.argv[1]
    outfilename = sys.argv[2]

    outfile = open(outfilename, 'w')
    print 'started'
    hg = Genome(genome)
    idb = geneinfoDB()
    geneinfoDict = idb.getallGeneInfo(genome)
    print 'max(geneinfoDict.keys())', max(geneinfoDict.keys())
    featDict = hg.getallGeneFeatures()
    print 'max(featDict.keys())', max(featDict.keys())
    geneIDs = featDict.keys()
    gidDict = {}
    symbolToGidDict = {}

    i=0

#    for gid in geneinfoDict:
#        print gid
#        symbol = geneinfoDict[gid][0][0].strip()
#        symbolToGidDict[gid] = symbol
#        outfile.write(symbol)

    for k in featDict.keys():
        print i
        i+=1
        print k
        start=0
        stop=0
        if idb.getGeneInfo((genome,k))==[]:
            name = 'LOC'+str(k)
        else:
            name = idb.getGeneInfo((genome,k))[0]
#        name = geneinfoDict[k][0][0].strip()
        leftPos=[]
        rightPos=[]
        for feature in featDict[k]:
            if feature[0]!='UTR':
                continue
            if feature[0]=='UTR':
                leftPos.append(feature[2])
                rightPos.append(feature[3])
        sense = str(featDict[k][0][4])
        chromosome = 'chr'+str(featDict[k][0][1])
        if len(chromosome)>5: 
            continue
        if leftPos==[] or rightPos==[]:
            UTR='NA'
            outfile.write('>'+name)
            outfile.write('/n')
            outfile.write(UTR)
            outfile.write('/n')
            continue
        if sense == 'F':
            start = max(leftPos)
            stop = max(rightPos)
        if sense == 'R':
            start = min(leftPos)
            stop = min(rightPos)
        if start == 0 or stop == 0:
            UTR='NA'
        print chromosome
        print start
        print stop
        UTR=getSequence(genome,chromosome,start,stop,sense)
        outfile.write('>'+name)
        outfile.write('\n')
        outfile.write(UTR)
        outfile.write('\n')
   
run()
