##################################
#                                #
# Last modified 12/3/2009         # 
#                                #
# Georgi Marinov                 #
#                                # 
##################################

import sys
import string
import math
from cistematic.core import Genome
from cistematic.core.geneinfo import geneinfoDB
from commoncode import *

def getSequence(hg,chromosome,start,stop,sense):
    
    DNA = {'A':'T','T':'A','G':'C','C':'G','N':'N'}
    chromosome = chromosome[3:len(chromosome)]
    if sense=='F':
        try:
            sequence = string.upper(hg.sequence(chromosome,start,stop-start))
        except:
            sequence=''
    if sense=='R':
        try:
            preliminarysequence = string.upper(hg.sequence(chromosome,start,stop-start))
        except:
            preliminarysequence=''
        sequence=''
        for i in range(len(preliminarysequence)):
            sequence=sequence+DNA[preliminarysequence[len(preliminarysequence)-i-1]]
    
    return sequence

def run():

    if len(sys.argv) < 2:
        print 'usage: python %s genome outputfilename ' % sys.argv[0]
        sys.exit(1)
    
    genome = sys.argv[1]
    outfilename = sys.argv[2]

    outfile = open(outfilename, 'w')

    genes = {}
    hg = Genome(genome)
    idb = geneinfoDB()
    geneinfoDict = idb.getallGeneInfo(genome)
    featDict = hg.getallGeneFeatures()
    geneIDs = featDict.keys()
    i=0
    outfile.write('GeneID\tGeneName\tmRNA_length\tGC%\n')
    for k in featDict.keys():
        if i % 1000 == 0:
            print len(featDict.keys())-i 
        i+=1
        sequence=''
        if idb.getGeneInfo((genome,k))==[]:
            name = 'LOC'+str(k)
        else:
            name = idb.getGeneInfo((genome,k))[0]
        chr= 'chr'+str(featDict[k][0][1])
        orientation=str(featDict[k][0][4])
        for feature in featDict[k]:
            start=int(feature[2])
            stop=int(feature[3])
            sequence=sequence+getSequence(hg,chr,start,stop,orientation)
        GC=0
        for pos in range(len(sequence)):
            if sequence[pos] == 'G' or sequence[pos]=='C':
                GC+=1
        try:
            GC=GC/(len(sequence)+0.0)
        except:
            print 'problem with', name, 'skipping'
            continue
        outline = '%s\t%s\t%s\t%s' % (k, name, len(sequence), GC)
        outfile.write(outline + '\n')
    outfile.close()
   
run()
